Example #1
def test_preprocess_data_weighted():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # XXX: if normalize=True, should we expect a weighted standard deviation?
    #      Currently not weighted, but calculated with respect to weighted mean
    expected_X_norm = (np.sqrt(X.shape[0]) *
                       np.mean((X - expected_X_mean) ** 2, axis=0) ** .5)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
    assert_array_almost_equal(yt, y - expected_y_mean)
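
These snippets are lifted out of their test modules, so names like rng, np, sparse, _preprocess_data and the assert helpers are assumed to be defined at module level. A minimal sketch of that shared setup, with the import path assumed for a recent scikit-learn release, might look like:

import numpy as np
from scipy import sparse
from numpy.testing import assert_array_almost_equal, assert_array_equal
# private helper; lives in sklearn.linear_model._base in recent releases
# (sklearn.linear_model.base in the older versions these tests were written against)
from sklearn.linear_model._base import _preprocess_data

rng = np.random.RandomState(0)  # shared random generator the tests refer to as `rng`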
Example #2
def test_preprocess_data():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    expected_X_mean = np.mean(X, axis=0)
    expected_X_norm = np.std(X, axis=0) * np.sqrt(X.shape[0])
    expected_y_mean = np.mean(y, axis=0)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=False, normalize=False)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
    assert_array_almost_equal(yt, y - expected_y_mean)
Example #3
def test_preprocess_data_multioutput():
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    args = [X, sparse.csc_matrix(X)]
    for X in args:
        _, yt, _, y_mean, _ = _preprocess_data(X,
                                               y,
                                               fit_intercept=False,
                                               normalize=False)
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        _, yt, _, y_mean, _ = _preprocess_data(X,
                                               y,
                                               fit_intercept=True,
                                               normalize=False)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)

        _, yt, _, y_mean, _ = _preprocess_data(X,
                                               y,
                                               fit_intercept=True,
                                               normalize=True)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)
Example #4
def test_preprocess_data_weighted():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)
    expected_X_mean = np.average(X, axis=0, weights=sample_weight)
    expected_y_mean = np.average(y, axis=0, weights=sample_weight)

    # XXX: if normalize=True, should we expect a weighted standard deviation?
    #      Currently not weighted, but calculated with respect to weighted mean
    expected_X_norm = (np.sqrt(X.shape[0]) * np.mean(
        (X - expected_X_mean)**2, axis=0)**.5)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt, X - expected_X_mean)
    assert_array_almost_equal(yt, y - expected_y_mean)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         sample_weight=sample_weight)
    assert_array_almost_equal(X_mean, expected_X_mean)
    assert_array_almost_equal(y_mean, expected_y_mean)
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt, (X - expected_X_mean) / expected_X_norm)
    assert_array_almost_equal(yt, y - expected_y_mean)
Example #5
def preprocess_data(self, X, y):
    # ensure that we fit the intercept by hand, but normalize if desired
    return _preprocess_data(X, y,
                            fit_intercept=False,
                            normalize=self.normalize,
                            copy=self.copy_X)
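
Since this wrapper forces fit_intercept=False, the caller is expected to handle the intercept itself. A hypothetical, self-contained sketch of that manual step (not taken from the original class) is:

import numpy as np

# Illustration only: fit on centered data, then rebuild the intercept from the means.
X = np.random.rand(50, 3)
y = np.random.rand(50)
coef, *_ = np.linalg.lstsq(X - X.mean(axis=0), y - y.mean(), rcond=None)
intercept = y.mean() - X.mean(axis=0) @ coef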
Example #6
def test_csr_preprocess_data():
    # Test output format of _preprocess_data, when input is csr
    X, y = make_regression()
    X[X < 2.5] = 0.0
    csr = sparse.csr_matrix(X)
    csr_, y, _, _, _ = _preprocess_data(csr, y, True)
    assert_equal(csr_.getformat(), 'csr')
Example #7
def test_randomized_logistic_sparse():
    # Check randomized sparse logistic regression on sparse data
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False,
                                       C=1.,
                                       random_state=42,
                                       scaling=scaling,
                                       n_resampling=50,
                                       tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    clf = RandomizedLogisticRegression(verbose=False,
                                       C=1.,
                                       random_state=42,
                                       scaling=scaling,
                                       n_resampling=50,
                                       tol=1e-3)
    feature_scores_sp = clf.fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp)
Example #8
def test_randomized_logistic_sparse():
    # Check randomized sparse logistic regression on sparse data
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores_sp = clf.fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp)
Example #9
def _preprocess_data(self, X, y, fit_intercept, normalize=False):
    """Center the data in X but not in y."""
    X, _, X_offset, _, X_scale = _preprocess_data(
        X, y, fit_intercept, normalize=normalize)
    return X, y, X_offset, y, X_scale
Example #10
def test_sparse_preprocess_data_with_return_mean():
    n_samples = 200
    n_features = 2
    # random_state not supported yet in sparse.rand
    X = sparse.rand(n_samples, n_features, density=.5)  # , random_state=rng
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=False, normalize=False,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y)

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt.A, XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt.A, XA / expected_X_norm)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))
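
The sparse path is deliberately different from the dense one: with return_mean=True the column means are returned but X itself is left uncentered so its sparsity survives, and downstream code applies the offset algebraically. A small sketch of that identity, using nothing beyond NumPy and SciPy:

import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
Xs = sparse.random(20, 3, density=0.5, format='csr', random_state=rng)
yv = rng.rand(20)
X_mean = np.asarray(Xs.mean(axis=0)).ravel()
# centering can be deferred: (X - X_mean).T @ y == X.T @ y - X_mean * y.sum()
lhs = (Xs.toarray() - X_mean).T @ yv
rhs = Xs.T @ yv - X_mean * yv.sum()
assert np.allclose(lhs, rhs)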
Example #11
def test_preprocess_data_multioutput():
    n_samples = 200
    n_features = 3
    n_outputs = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    expected_y_mean = np.mean(y, axis=0)

    args = [X, sparse.csc_matrix(X)]
    for X in args:
        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=False,
                                               normalize=False)
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True,
                                               normalize=False)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)

        _, yt, _, y_mean, _ = _preprocess_data(X, y, fit_intercept=True,
                                               normalize=True)
        assert_array_almost_equal(y_mean, expected_y_mean)
        assert_array_almost_equal(yt, y - y_mean)
Example #12
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X, y, True,
                                       copy=to_copy, check_input=False)

    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)
Example #13
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    X, y = make_regression()
    X[X < 2.5] = 0.0

    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(X,
                                       y,
                                       True,
                                       copy=to_copy,
                                       check_input=False)

    if to_copy and is_sparse:
        assert not np.may_share_memory(X_.data, X.data)
    elif to_copy:
        assert not np.may_share_memory(X_, X)
    elif is_sparse:
        assert np.may_share_memory(X_.data, X.data)
    else:
        assert np.may_share_memory(X_, X)
Example #14
def test_dtype_preprocess_data():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:

            Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data(
                X_32,
                y_32,
                fit_intercept=fit_intercept,
                normalize=normalize,
                return_mean=True)

            Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data(
                X_64,
                y_64,
                fit_intercept=fit_intercept,
                normalize=normalize,
                return_mean=True)

            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = (
                _preprocess_data(X_32,
                                 y_64,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize,
                                 return_mean=True))

            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = (
                _preprocess_data(X_64,
                                 y_32,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize,
                                 return_mean=True))

            assert_equal(Xt_32.dtype, np.float32)
            assert_equal(yt_32.dtype, np.float32)
            assert_equal(X_mean_32.dtype, np.float32)
            assert_equal(y_mean_32.dtype, np.float32)
            assert_equal(X_norm_32.dtype, np.float32)

            assert_equal(Xt_64.dtype, np.float64)
            assert_equal(yt_64.dtype, np.float64)
            assert_equal(X_mean_64.dtype, np.float64)
            assert_equal(y_mean_64.dtype, np.float64)
            assert_equal(X_norm_64.dtype, np.float64)

            assert_equal(Xt_3264.dtype, np.float32)
            assert_equal(yt_3264.dtype, np.float32)
            assert_equal(X_mean_3264.dtype, np.float32)
            assert_equal(y_mean_3264.dtype, np.float32)
            assert_equal(X_norm_3264.dtype, np.float32)

            assert_equal(Xt_6432.dtype, np.float64)
            assert_equal(yt_6432.dtype, np.float64)
            assert_equal(X_mean_6432.dtype, np.float64)
            assert_equal(y_mean_6432.dtype, np.float64)
            assert_equal(X_norm_6432.dtype, np.float64)

            assert_equal(X_32.dtype, np.float32)
            assert_equal(y_32.dtype, np.float32)
            assert_equal(X_64.dtype, np.float64)
            assert_equal(y_64.dtype, np.float64)

            assert_array_almost_equal(Xt_32, Xt_64)
            assert_array_almost_equal(yt_32, yt_64)
            assert_array_almost_equal(X_mean_32, X_mean_64)
            assert_array_almost_equal(y_mean_32, y_mean_64)
            assert_array_almost_equal(X_norm_32, X_norm_64)
Example #15
def _alpha_grid(X, y, Xy=None, l1_ratio=1.0, fit_intercept=True,
                eps=1e-3, n_alphas=100, normalize=False, copy_X=True):
    """ Compute the grid of alpha values for elastic net parameter search
    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data. Pass directly as Fortran-contiguous data to avoid
        unnecessary memory duplication
    y : ndarray, shape (n_samples,)
        Target values
    Xy : array-like, optional
        Xy = np.dot(X.T, y) that can be precomputed.
    l1_ratio : float
        The elastic net mixing parameter, with ``0 < l1_ratio <= 1``.
        For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not
        supported) ``For l1_ratio = 1`` it is an L1 penalty. For
        ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2.
    eps : float, optional
        Length of the path. ``eps=1e-3`` means that
        ``alpha_min / alpha_max = 1e-3``
    n_alphas : int, optional
        Number of alphas along the regularization path
    fit_intercept : boolean, default True
        Whether to fit an intercept or not
    normalize : boolean, optional, default False
        If ``True``, the regressors X will be normalized before regression.
        This parameter is ignored when ``fit_intercept`` is set to ``False``.
        When the regressors are normalized, note that this makes the
        hyperparameters learnt more robust and almost independent of the number
        of samples. The same property is not valid for standardized data.
        However, if you wish to standardize, please use
        :class:`preprocessing.StandardScaler` before calling ``fit`` on an estimator
        with ``normalize=False``.
    copy_X : boolean, optional, default True
        If ``True``, X will be copied; else, it may be overwritten.
    """
    if l1_ratio == 0:
        raise ValueError("Automatic alpha grid generation is not supported for"
                         " l1_ratio=0. Please supply a grid by providing "
                         "your estimator with the appropriate `alphas=` "
                         "argument.")
    n_samples = len(y)

    sparse_center = False
    if Xy is None:
        X_sparse = sparse.isspmatrix(X)
        sparse_center = X_sparse and (fit_intercept or normalize)
        X = check_array(X, 'csc',
                        copy=(copy_X and fit_intercept and not X_sparse))
        if not X_sparse:
            # X can be touched inplace thanks to the above line
            X, y, _, _, _ = _preprocess_data(X, y, fit_intercept,
                                             normalize, copy=False)
        Xy = safe_sparse_dot(X.T, y, dense_output=True)

        if sparse_center:
            # Workaround to find alpha_max for sparse matrices.
            # since we should not destroy the sparsity of such matrices.
            _, _, X_offset, _, X_scale = _preprocess_data(
                X, y, fit_intercept, normalize, return_mean=True)
            mean_dot = X_offset * np.sum(y)

    if Xy.ndim == 1:
        Xy = Xy[:, np.newaxis]

    if sparse_center:
        if fit_intercept:
            Xy -= mean_dot[:, np.newaxis]
        if normalize:
            Xy /= X_scale[:, np.newaxis]

    alpha_max = (np.sqrt(np.sum(Xy ** 2, axis=1)).max() /
                 (n_samples * l1_ratio))

    if alpha_max <= np.finfo(float).resolution:
        alphas = np.empty(n_alphas)
        alphas.fill(np.finfo(float).resolution)
        return alphas

    return np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max),
                       num=n_alphas)[::-1]
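
The quantity of interest here is alpha_max, the smallest penalty for which every coefficient is exactly zero. A quick dense illustration of the same formula (illustrative values only):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = rng.rand(100)
n_samples, l1_ratio, eps, n_alphas = X.shape[0], 1.0, 1e-3, 100
Xc, yc = X - X.mean(axis=0), y - y.mean()   # mirrors fit_intercept=True preprocessing
alpha_max = np.abs(Xc.T @ yc).max() / (n_samples * l1_ratio)
alphas = np.logspace(np.log10(alpha_max * eps), np.log10(alpha_max), num=n_alphas)[::-1]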
Example #16
    def fit(self, X, y, seed=None, verbose=False, sample_weight=None):
        """Fit data according to the UoI-Lasso algorithm.
		Relevant information (fits, residuals, model performance) is stored within object.
		Thus, nothing is returned by this function.

		Parameters
		----------
		X : np array (2d)
			the design matrix, containing the predictors.
			its shape is assumed to be (number of samples, number of features).

		y : np array (1d)
			the vector of dependent variables.
			its length is assumed to be (number of samples,).

		seed : int
			a seed for the random number generator. this number is relevant
			for choosing bootstraps and dividing the data into training and test sets.

		verbose : boolean
			a boolean switch indicating whether the fitting should print out its progress.
		"""
        # initialize the seed, if it's provided
        if seed is not None:
            np.random.seed(seed)

        # start taken from sklearn.LinearModels.base.LinearRegression
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        # preprocess data through centering and normalization
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        # extract model dimensions from design matrix
        self.n_samples_, self.n_features_ = X.shape

        if verbose:
            print('(1) Loaded data.\n %s samples with %s features.' %
                  (self.n_samples_, self.n_features_))

        # perform an initial coarse sweep over the lambda parameters
        # this is to zero-in on the relevant regularization region.
        if self.n_lambdas == 1:
            lambda_coarse = np.array([1.0])
        else:
            lambda_coarse = np.logspace(-3.,
                                        3.,
                                        self.n_lambdas,
                                        dtype=np.float64)
        # run the coarse lasso sweep
        estimates_coarse, scores_coarse = \
         self.lasso_sweep(
          X, y, lambda_coarse, self.train_frac_sel, self.n_boots_coarse,
          self.use_admm, desc='coarse lasso sweep', verbose=verbose
         )
        # deduce the index which maximizes the explained variance over bootstraps
        lambda_max_idx = np.argmax(np.mean(scores_coarse, axis=0))
        # obtain the lambda which maximizes the explained variance over bootstraps
        lambda_max = lambda_coarse[lambda_max_idx]
        # in our dense sweep, we'll explore lambda values which encompass a
        # range that's one order of magnitude less than lambda_max itself
        d_lambda = 10**(np.floor(np.log10(lambda_max)) - 1)

        # now that we've narrowed down the regularization parameters,
        # we'll run a dense sweep which begins the model selection module of UoI

        #######################
        ### Model Selection ###
        #######################
        if verbose:
            print(
                '(2) Beginning model selection. Exploring penalty region centered at %g.'
                % lambda_max)

        # create the final lambda set based on the coarse sweep
        if self.n_lambdas == 1:
            lambdas = np.array([lambda_max])
        else:
            lambdas = np.linspace(lambda_max - 5 * d_lambda,
                                  lambda_max + 5 * d_lambda,
                                  self.n_lambdas,
                                  dtype=np.float64)
        # run the lasso sweep with new lambda set
        estimates_dense, scores_dense = \
         self.lasso_sweep(
          X, y, lambdas, self.train_frac_sel, self.n_boots_sel,
          self.use_admm, desc='fine lasso sweep', verbose=verbose
         )
        # choose selection fraction threshold values to use
        selection_frac_thresholds = np.linspace(self.selection_thres_min,
                                                self.selection_thres_max,
                                                self.n_selection_thres)
        # calculate the actual number of thresholds, but delete any repetitions
        selection_thresholds = np.sort(
            np.unique(
                (self.n_boots_sel * selection_frac_thresholds).astype('int')))
        # create support matrix
        self.supports_ = np.zeros(
            (self.n_selection_thres, self.n_lambdas, self.n_features_),
            dtype=bool)
        # iterate over each stability selection threshold
        for thres_idx, threshold in enumerate(selection_thresholds):
            # calculate the support given the specific selection threshold
            self.supports_[thres_idx, :] = np.count_nonzero(
                estimates_dense, axis=0) >= threshold
        # reshape support matrix so that first axis consists of all combinations of hyperparameters
        self.supports_ = np.reshape(
            self.supports_,
            (self.n_selection_thres * self.n_lambdas, self.n_features_))

        ########################
        ### Model Estimation ###
        ########################
        # we'll use the supports obtained in the selection module to calculate
        # bagged OLS estimates over bootstraps

        if verbose:
            print(
                '(3) Model selection complete. Beginning model estimation, with %s bootstraps.'
                % self.n_boots_est)

        # create or overwrite arrays to collect final results
        self.coef_ = np.zeros(self.n_features_, dtype=np.float32)
        self.scores_ = np.zeros(1, dtype=np.float32)
        # determine how many samples will be used for overall training
        train_split = int(round(self.train_frac_overall * self.n_samples_))
        # determine how many samples will be used for training within a bootstrap
        boot_train_split = int(round(self.train_frac_est * train_split))

        # set up data arrays
        estimates = np.zeros(
            (self.n_boots_est, self.n_lambdas, self.n_features_),
            dtype=np.float32)
        scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)
        # either we plan on using a test set, or we'll use the entire dataset for training
        if self.train_frac_overall < 1:
            # generate indices for the global training and testing blocks
            indices = np.random.permutation(self.n_samples_)
            train, test = np.split(indices, [train_split])
            # compile the training and test sets
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
        else:
            X_train = X
            y_train = y

        # iterate over bootstrap samples
        for bootstrap in trange(self.n_boots_est,
                                desc='Model Estimation',
                                disable=not verbose):
            # extract the bootstrap indices, keeping a fraction of the data
            # available for testing
            bootstrap_indices = np.random.permutation(train_split)
            train_boot, test_boot = np.split(bootstrap_indices,
                                             [boot_train_split])
            # iterate over the regularization parameters
            for lamb_idx, lamb in enumerate(lambdas):
                support = self.supports_[lamb_idx]
                if np.any(support):
                    # fit OLS using the supports from selection module
                    X_boot = X_train[train_boot]
                    y_boot = y_train[train_boot]
                    ols = lm.LinearRegression()
                    ols.fit(X_boot[:, support], y_boot - y_boot.mean())
                    # store the fitted coefficients
                    estimates[bootstrap, lamb_idx, support] = ols.coef_
                    # calculate and store the performance on the test set
                    y_hat_boot = np.dot(X_train[test_boot],
                                        estimates[bootstrap, lamb_idx, :])
                    y_true_boot = y_train[test_boot] - y_train[test_boot].mean(
                    )
                    # calculate sum of squared residuals
                    rss = np.sum((y_hat_boot - y_true_boot)**2)
                    # calculate the estimation score (R^2 or BIC)
                    if self.estimation_score == 'r2':
                        scores[bootstrap,
                               lamb_idx] = r2_score(y_true_boot, y_hat_boot)
                    elif self.estimation_score == 'BIC':
                        n_selected_features = np.count_nonzero(support)
                        scores[bootstrap, lamb_idx] = -utils.BIC(
                            n_features=n_selected_features,
                            n_samples=boot_train_split,
                            rss=rss)
                else:
                    # if no variables were selected, throw a message
                    # we'll leave the scores array unchanged, so any support
                    # with no selection will be assigned a score of 0.
                    print(
                        'No variables selected in the support for lambda = %g.'
                        % lamb)

        if verbose:
            print('(4) Bagging estimates, using bagging option %s.' %
                  self.bagging_options)

        if self.bagging_options == 1:
            # bagging option 1: for each bootstrap sample, find the regularization parameter that gave the best results
            lambda_max_idx = np.argmax(scores, axis=1)
            # extract the estimates over bootstraps from the model with best lambda
            best_estimates = estimates[np.arange(self.n_boots_est),
                                       lambda_max_idx, :]
            # take the median across estimates for the final, bagged estimate
            self.coef_ = np.median(best_estimates, axis=0)
        elif self.bagging_options == 2:
            # bagging option 2: average estimates across bootstraps, and then find the regularization parameter that gives the best results
            mean_scores = np.mean(scores, axis=0)
            lambda_max_idx = np.argmax(mean_scores)
            self.coef_ = np.median(estimates[:, lambda_max_idx, :], 0)
        else:
            raise ValueError('Bagging option %d is not available.' %
                             self.bagging_options)
        # if we extracted a test set, evaluate the model
        if self.train_frac_overall < 1:
            # finally, see how the bagged estimates perform on the test set
            y_hat = np.dot(X_test, self.coef_)
            y_true = y_test - y_test.mean()
            # calculate and store performance of the final UoI_Lasso estimator over test set
            self.scores_ = r2_score(y_true, y_hat)
        else:
            self.scores_ = None

        if verbose:
            print("---> UoI Lasso complete.")

        if y.ndim == 1:
            self.coef_ = np.ravel(self.coef_)
        self._set_intercept(X_offset, y_offset, X_scale)

        return self
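
The closing _set_intercept call maps the coefficients learned on the preprocessed data back to the original scale. Roughly, the transformation being undone is the following (a standalone sketch, not code from this estimator):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = X @ np.array([1.0, -2.0, 0.5]) + 3.0
X_offset, y_offset = X.mean(axis=0), y.mean()
X_scale = X.std(axis=0) * np.sqrt(X.shape[0])    # the normalize=True scaling used above
coef_scaled, *_ = np.linalg.lstsq((X - X_offset) / X_scale, y - y_offset, rcond=None)
coef = coef_scaled / X_scale                     # back to the original feature scale
intercept = y_offset - X_offset @ coef           # what _set_intercept reconstructs
assert np.allclose(X @ coef + intercept, y)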
Example #17
    def fit(self, X, y, init=None):
        """Fit the Poisson GLM with coordinate descent.

        Parameters
        ----------
        X : nd-array, shape (n_samples, n_features)
            The design matrix.

        y : nd-array, shape (n_samples,)
            Response vector. Will be cast to X's dtype if necessary.
            Currently, this implementation does not handle multiple response
            variables.

        init : nd-array, shape (n_features)
            Initialization for parameters.
        """
        self.n_samples, self.n_features = X.shape

        # initialization
        if self.warm_start and hasattr(self, 'coef_'):
            coef = self.coef_
        else:
            coef = np.zeros(shape=(self.n_features))

        if init is not None:
            coef = init

        intercept = 0

        # we will handle the intercept by hand: only preprocess the design
        # matrix
        X, _, X_offset, _, X_scale = _preprocess_data(X,
                                                      y,
                                                      fit_intercept=False,
                                                      normalize=self.normalize)

        # all features are initially active
        active_idx = np.arange(self.n_features)

        coef_update = np.zeros(coef.shape)
        # perform coordinate descent updates
        for iteration in range(self.max_iter):

            # linearize the log-likelihood
            w, z = self.adjusted_response(X, y, coef, intercept)

            # perform an update of coordinate descent
            coef_update, intercept = self.cd_sweep(coef=coef,
                                                   intercept=intercept,
                                                   X=X,
                                                   w=w,
                                                   z=z,
                                                   active_idx=active_idx)

            # check convergence
            if np.max(np.abs(coef_update - coef)) < self.tol:
                break

            coef = coef_update

            # update the active features
            active_idx = np.argwhere(coef != 0).ravel()

        self.intercept_ = intercept
        self.coef_ = coef_update / X_scale
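
For reference, adjusted_response linearizes the Poisson log-likelihood around the current coefficients. With the canonical log link, the usual IRLS working weights and working response are (a sketch under that assumption, not the class's actual implementation):

import numpy as np

def adjusted_response_sketch(X, y, coef, intercept):
    # linear predictor and current mean under the log link
    eta = X @ coef + intercept
    mu = np.exp(eta)
    w = mu                     # IRLS working weights for Poisson
    z = eta + (y - mu) / mu    # IRLS working (adjusted) response
    return w, z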
Example #18
def test_dtype_preprocess_data():
    n_samples = 200
    n_features = 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:

            Xt_32, yt_32, X_mean_32, y_mean_32, X_norm_32 = _preprocess_data(
                X_32, y_32, fit_intercept=fit_intercept, normalize=normalize,
                return_mean=True)

            Xt_64, yt_64, X_mean_64, y_mean_64, X_norm_64 = _preprocess_data(
                X_64, y_64, fit_intercept=fit_intercept, normalize=normalize,
                return_mean=True)

            Xt_3264, yt_3264, X_mean_3264, y_mean_3264, X_norm_3264 = (
                _preprocess_data(X_32, y_64, fit_intercept=fit_intercept,
                                 normalize=normalize, return_mean=True))

            Xt_6432, yt_6432, X_mean_6432, y_mean_6432, X_norm_6432 = (
                _preprocess_data(X_64, y_32, fit_intercept=fit_intercept,
                                 normalize=normalize, return_mean=True))

            assert_equal(Xt_32.dtype, np.float32)
            assert_equal(yt_32.dtype, np.float32)
            assert_equal(X_mean_32.dtype, np.float32)
            assert_equal(y_mean_32.dtype, np.float32)
            assert_equal(X_norm_32.dtype, np.float32)

            assert_equal(Xt_64.dtype, np.float64)
            assert_equal(yt_64.dtype, np.float64)
            assert_equal(X_mean_64.dtype, np.float64)
            assert_equal(y_mean_64.dtype, np.float64)
            assert_equal(X_norm_64.dtype, np.float64)

            assert_equal(Xt_3264.dtype, np.float32)
            assert_equal(yt_3264.dtype, np.float32)
            assert_equal(X_mean_3264.dtype, np.float32)
            assert_equal(y_mean_3264.dtype, np.float32)
            assert_equal(X_norm_3264.dtype, np.float32)

            assert_equal(Xt_6432.dtype, np.float64)
            assert_equal(yt_6432.dtype, np.float64)
            assert_equal(X_mean_6432.dtype, np.float64)
            assert_equal(y_mean_6432.dtype, np.float64)
            assert_equal(X_norm_6432.dtype, np.float64)

            assert_equal(X_32.dtype, np.float32)
            assert_equal(y_32.dtype, np.float32)
            assert_equal(X_64.dtype, np.float64)
            assert_equal(y_64.dtype, np.float64)

            assert_array_almost_equal(Xt_32, Xt_64)
            assert_array_almost_equal(yt_32, yt_64)
            assert_array_almost_equal(X_mean_32, X_mean_64)
            assert_array_almost_equal(y_mean_32, y_mean_64)
            assert_array_almost_equal(X_norm_32, X_norm_64)
Example #19
    def fit(self,
            X,
            y,
            groups=None,
            seed=None,
            verbose=False,
            sample_weight=None,
            option=True):
        """Fit data according to the UoI-Lasso algorithm.
		Relevant information (fits, residuals, model performance) is stored within object.
		Thus, nothing is returned by this function.

		Parameters
		----------
		X : np array (2d)
			the design matrix, containing the predictors.
			its shape is assumed to be (number of samples, number of features).

		y : np array (1d)
			the vector of dependent variables.
			its length is assumed to be (number of samples,).

		seed : int
			a seed for the random number generator. this number is relevant
			for choosing bootstraps and dividing the data into training and test sets.

		verbose : boolean
			a boolean switch indicating whether the fitting should print out its progress.
		"""
        # initialize the seed, if it's provided

        if seed is not None:
            np.random.seed(seed)

        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         y_numeric=True,
                         multi_output=True)

        # preprocess data through centering and normalization
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            fit_intercept=self.fit_intercept,
            normalize=self.normalize,
            copy=self.copy_X)

        if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        # extract model dimensions from design matrix
        self.n_samples_, self.n_features_ = X.shape
        # create or overwrite arrays to collect final results
        self.coef_ = np.zeros(self.n_features_, dtype=np.float32)

        # group leveling
        if groups is None:
            self.groups_ = np.ones(self.n_samples_)
        else:
            self.groups_ = np.array(groups)

        if verbose:
            print('(1) Loaded data.\n %s samples with %s features.' %
                  (self.n_samples_, self.n_features_))

        self.lambdas = _alpha_grid(X=X,
                                   y=y,
                                   l1_ratio=1.0,
                                   fit_intercept=self.fit_intercept,
                                   eps=1e-3,
                                   n_alphas=self.n_lambdas,
                                   normalize=self.normalize)

        # sweep over the grid of regularization strengths
        estimates_selection, _ = \
         self.lasso_sweep(
          X, y, self.lambdas, self.train_frac_sel, self.n_boots_sel,
          self.use_admm, desc='fine lasso sweep', verbose=verbose
         )

        # perform the intersection step
        self.intersection(estimates_selection)

        ########################
        ### Model Estimation ###
        ########################
        # we'll use the supports obtained in the selection module to calculate
        # bagged OLS estimates over bootstraps

        if verbose:
            print('(3) Beginning model estimation, with %s bootstraps.' %
                  self.n_boots_est)

        # compute number of samples per bootstrap
        n_samples_bootstrap = int(round(self.train_frac_est * self.n_samples_))

        # set up data arrays
        estimates = np.zeros(
            (self.n_boots_est, self.n_lambdas, self.n_features_),
            dtype=np.float32)
        scores = np.zeros((self.n_boots_est, self.n_lambdas), dtype=np.float32)

        # iterate over bootstrap samples
        for bootstrap in trange(self.n_boots_est,
                                desc='Model Estimation',
                                disable=not verbose):

            # extract the bootstrap indices, keeping a fraction of the data available for testing
            train_idx, test_idx = utils.leveled_randomized_ids(
                self.groups_, self.train_frac_est)

            # iterate over the regularization parameters
            for lamb_idx, lamb in enumerate(self.lambdas):
                # extract current support set
                support = self.supports_[lamb_idx]

                # extract response vectors
                y_train = y[train_idx]
                y_test = y[test_idx]

                # if nothing was selected, we won't bother running OLS
                if np.any(support):
                    # get design matrices
                    X_train = X[train_idx][:, support]
                    X_test = X[test_idx][:, support]

                    # compute ols estimate
                    ols = lm.LinearRegression()
                    ols.fit(X_train, y_train)

                    # store the fitted coefficients
                    estimates[bootstrap, lamb_idx, support] = ols.coef_

                    # calculate estimation score
                    if self.estimation_score == 'r2':
                        scores[bootstrap, lamb_idx] = ols.score(X_test, y_test)
                    elif self.estimation_score == 'BIC':
                        y_pred = ols.predict(X_test)
                        n_features = np.count_nonzero(support)
                        scores[bootstrap,
                               lamb_idx] = -utils.BIC(y_true=y_test,
                                                      y_pred=y_pred,
                                                      n_features=n_features)
                    elif self.estimation_score == 'AIC':
                        y_pred = ols.predict(X_test)
                        n_features = np.count_nonzero(support)
                        scores[bootstrap,
                               lamb_idx] = -utils.AIC(y_true=y_test,
                                                      y_pred=y_pred,
                                                      n_features=n_features)
                    elif self.estimation_score == 'AICc':
                        y_pred = ols.predict(X_test)
                        n_features = np.count_nonzero(support)
                        scores[bootstrap,
                               lamb_idx] = -utils.AICc(y_true=y_test,
                                                       y_pred=y_pred,
                                                       n_features=n_features)
                    else:
                        raise ValueError(
                            str(self.estimation_score) +
                            ' is not a valid option.')
                else:
                    if self.estimation_score == 'r2':
                        scores[bootstrap, lamb_idx] = r2_score(
                            y_true=y_test, y_pred=np.zeros(y_test.size))
                    elif self.estimation_score == 'BIC':
                        n_features = 0
                        scores[bootstrap, lamb_idx] = -utils.BIC(
                            y_true=y_test,
                            y_pred=np.zeros(y_test.size),
                            n_features=n_features)
                    elif self.estimation_score == 'AIC':
                        n_features = 0
                        scores[bootstrap, lamb_idx] = -utils.AIC(
                            y_true=y_test,
                            y_pred=np.zeros(y_test.size),
                            n_features=n_features)
                    elif self.estimation_score == 'AICc':
                        n_features = 0
                        scores[bootstrap, lamb_idx] = -utils.AICc(
                            y_true=y_test,
                            y_pred=np.zeros(y_test.size),
                            n_features=n_features)
                    else:
                        raise ValueError(
                            str(self.estimation_score) +
                            ' is not a valid option.')

        if verbose:
            print('(4) Bagging estimates, using bagging option %s.' %
                  self.bagging_options)

        # bagging option 1:
        #	for each bootstrap sample, find the regularization parameter that gave the best results
        if self.bagging_options == 1:
            self.lambda_max_idx = np.argmax(scores, axis=1)
            # extract the estimates over bootstraps from the model with best lambda
            best_estimates = estimates[np.arange(self.n_boots_est),
                                       self.lambda_max_idx, :]
            # take the median across estimates for the final, bagged estimate
            self.coef_ = np.median(best_estimates, axis=0)

        # bagging option 2:
        #	average estimates across bootstraps, and then find the regularization parameter that gives the best results
        elif self.bagging_options == 2:
            mean_scores = np.mean(scores, axis=0)
            self.lambda_max_idx = np.argmax(mean_scores)
            self.coef_ = np.median(estimates[:, self.lambda_max_idx, :], 0)

        else:
            raise ValueError('Bagging option %d is not available.' %
                             self.bagging_options)

        if verbose:
            print("---> UoI Lasso complete.")

        self._set_intercept(X_offset, y_offset, X_scale)

        return self
Example #20
def test_deprecation_center_data():
    n_samples = 200
    n_features = 2

    w = 1.0 + rng.rand(n_samples)
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    param_grid = product([True, False], [True, False], [True, False],
                         [None, w])

    for (fit_intercept, normalize, copy, sample_weight) in param_grid:

        XX = X.copy()  # such that we can try copy=False as well

        X1, y1, X1_mean, X1_var, y1_mean = \
            center_data(XX, y, fit_intercept=fit_intercept,
                        normalize=normalize, copy=copy,
                        sample_weight=sample_weight)

        XX = X.copy()

        X2, y2, X2_mean, X2_var, y2_mean = \
            _preprocess_data(XX, y, fit_intercept=fit_intercept,
                             normalize=normalize, copy=copy,
                             sample_weight=sample_weight)

        assert_array_almost_equal(X1, X2)
        assert_array_almost_equal(y1, y2)
        assert_array_almost_equal(X1_mean, X2_mean)
        assert_array_almost_equal(X1_var, X2_var)
        assert_array_almost_equal(y1_mean, y2_mean)

    # Sparse cases
    X = sparse.csr_matrix(X)

    for (fit_intercept, normalize, copy, sample_weight) in param_grid:

        X1, y1, X1_mean, X1_var, y1_mean = \
            center_data(X, y, fit_intercept=fit_intercept, normalize=normalize,
                        copy=copy, sample_weight=sample_weight)

        X2, y2, X2_mean, X2_var, y2_mean = \
            _preprocess_data(X, y, fit_intercept=fit_intercept,
                             normalize=normalize, copy=copy,
                             sample_weight=sample_weight, return_mean=False)

        assert_array_almost_equal(X1.toarray(), X2.toarray())
        assert_array_almost_equal(y1, y2)
        assert_array_almost_equal(X1_mean, X2_mean)
        assert_array_almost_equal(X1_var, X2_var)
        assert_array_almost_equal(y1_mean, y2_mean)

    for (fit_intercept, normalize) in product([True, False], [True, False]):

        X1, y1, X1_mean, X1_var, y1_mean = \
            sparse_center_data(X, y, fit_intercept=fit_intercept,
                               normalize=normalize)

        X2, y2, X2_mean, X2_var, y2_mean = \
            _preprocess_data(X, y, fit_intercept=fit_intercept,
                             normalize=normalize, return_mean=True)

        assert_array_almost_equal(X1.toarray(), X2.toarray())
        assert_array_almost_equal(y1, y2)
        assert_array_almost_equal(X1_mean, X2_mean)
        assert_array_almost_equal(X1_var, X2_var)
        assert_array_almost_equal(y1_mean, y2_mean)
Example #21
def preprocess_data(self, X, y):
    return _preprocess_data(X, y,
                            fit_intercept=self.fit_intercept,
                            normalize=self.normalize,
                            copy=self.copy_X)
Example #22
    def fit(self, T, y):
        """
        Ref: Sparse Non-Negative Solution of a Linear System of Equations is Unique

        T: (N x L)
        y: (N x 1)
        max_iter: the maximum number of iterations. If requested_intermediate_solutions_sizes is None, return the max_iter-sparse solution.
        requested_intermediate_solutions_sizes: a list of the sizes of the other intermediate solutions to return besides the max_iter one (they are returned in a list with matching indexes).

        Return the list of intermediate solutions. If the perfect solution is found before the end, the list may not be full.
        """
        # preprocessing step borrowed from scikit-learn's linear-model helpers
        T, y, T_offset, y_offset, T_scale = _preprocess_data(
            T, y, fit_intercept=True, normalize=False, copy=False,
            return_mean=True, check_input=True)

        iter_intermediate_solutions_sizes = iter(self.requested_intermediate_solutions_sizes)

        lst_intermediate_solutions = []
        bool_arr_selected_indexes = np.zeros(T.shape[1], dtype=bool)
        residual = y
        i = 0
        next_solution = next(iter_intermediate_solutions_sizes, None)
        while (i < self.max_iter and next_solution is not None
               and not np.isclose(np.linalg.norm(residual), 0)):
            # if logger is not None: logger.debug("iter {}".format(i))
            # compute all correlations between atoms and residual
            dot_products = T.T @ residual

            idx_max_dot_product = np.argmax(dot_products)
            # only positively correlated results can be taken
            if dot_products[idx_max_dot_product] <= 0:
                self._logger.warning("No other atoms is positively correlated with the residual. End prematurely with {} atoms.".format(i + 1))
                break

            # selection of atom with max correlation with residual
            bool_arr_selected_indexes[idx_max_dot_product] = True

            tmp_T = T[:, bool_arr_selected_indexes]
            sol = nnls(tmp_T, y)[0]  # non negative least square
            residual = y - tmp_T @ sol
            int_used_atoms = np.sum(sol.astype(bool))
            if int_used_atoms != i + 1:
                self._logger.warning("Atom found but not used. {} < {}".format(int_used_atoms, i + 1))

            if i + 1 == next_solution:
                final_vec = np.zeros(T.shape[1])
                final_vec[bool_arr_selected_indexes] = sol  # solution is full of zero but on selected indices
                lst_intermediate_solutions.append(final_vec)
                next_solution = next(iter_intermediate_solutions_sizes, None)

            i += 1

        if len(lst_intermediate_solutions) == 0 and np.isclose(np.linalg.norm(residual), 0):
            final_vec = np.zeros(T.shape[1])
            final_vec[bool_arr_selected_indexes] = sol  # solution is full of zero but on selected indices
            lst_intermediate_solutions.append(final_vec)

        nb_missing_solutions = len(self.requested_intermediate_solutions_sizes) - len(lst_intermediate_solutions)

        if nb_missing_solutions > 0:
            if self.fill_with_final_solution:
                self._logger.warning("nn_omp ended prematurely and found fewer solutions than expected: "
                                     "expected {}. found {}. Filling with the last solution.".format(
                                         len(self.requested_intermediate_solutions_sizes), len(lst_intermediate_solutions)))
                lst_intermediate_solutions.extend([deepcopy(lst_intermediate_solutions[-1]) for _ in range(nb_missing_solutions)])
            else:
                self._logger.warning("nn_omp ended prematurely and found fewer solutions than expected: "
                                     "expected {}. found {}.".format(
                                         len(self.requested_intermediate_solutions_sizes), len(lst_intermediate_solutions)))

        self.lst_intermediate_solutions = lst_intermediate_solutions
        self._set_intercept(T_offset, y_offset, T_scale)
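
Stripped of the intermediate-solution bookkeeping, the loop above is a non-negative variant of orthogonal matching pursuit. A simplified, self-contained sketch of the same idea:

import numpy as np
from scipy.optimize import nnls

def nn_omp_sketch(T, y, max_iter):
    # Greedy selection: add the atom most positively correlated with the residual,
    # then refit all selected atoms with non-negative least squares.
    selected = np.zeros(T.shape[1], dtype=bool)
    residual = y.copy()
    sol = np.zeros(0)
    for _ in range(max_iter):
        dots = T.T @ residual
        j = int(np.argmax(dots))
        if dots[j] <= 0:                      # no positively correlated atom left
            break
        selected[j] = True
        sol, _ = nnls(T[:, selected], y)
        residual = y - T[:, selected] @ sol
        if np.isclose(np.linalg.norm(residual), 0):
            break
    coef = np.zeros(T.shape[1])
    coef[selected] = sol
    return coef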