Пример #1
0
def test_preprocess_data_weighted():
    """Check `_preprocess_data` offsets and scales when sample weights are given.

    The expected offsets are the weighted averages of X and y. The call is
    made once with ``normalize=False`` and once with ``normalize=True``.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)
    sample_weight = rng.rand(n_samples)

    weighted_X_mean = np.average(X, axis=0, weights=sample_weight)
    weighted_y_mean = np.average(y, axis=0, weights=sample_weight)

    # XXX: if normalize=True, should we expect a weighted standard deviation?
    #      Currently not weighted, but calculated with respect to weighted mean
    squared_dev = (X - weighted_X_mean) ** 2
    norm_about_weighted_mean = (
        np.sqrt(X.shape[0]) * np.mean(squared_dev, axis=0) ** .5)

    # (normalize flag, expected per-feature scale)
    scenarios = (
        (False, np.ones(n_features)),
        (True, norm_about_weighted_mean),
    )
    for normalize, wanted_norm in scenarios:
        Xt, yt, X_mean, y_mean, X_norm = _preprocess_data(
            X, y, fit_intercept=True, normalize=normalize,
            sample_weight=sample_weight)
        assert_array_almost_equal(X_mean, weighted_X_mean)
        assert_array_almost_equal(y_mean, weighted_y_mean)
        assert_array_almost_equal(X_norm, wanted_norm)
        # Dividing by a unit scale leaves the centered data unchanged.
        assert_array_almost_equal(Xt, (X - weighted_X_mean) / wanted_norm)
        assert_array_almost_equal(yt, y - weighted_y_mean)
Пример #2
0
def test_preprocess_data_multioutput():
    """Check how `_preprocess_data` centers a two-column target.

    Both a dense array and its CSC version are passed as X. Without an
    intercept y must come back untouched with a zero offset; with an
    intercept the offset must be the column-wise mean of y.
    """
    n_samples, n_features, n_outputs = 200, 3, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_outputs)
    column_means = np.mean(y, axis=0)

    for data in (X, sparse.csc_matrix(X)):
        # No intercept: nothing is subtracted from y.
        _, yt, _, y_mean, _ = _preprocess_data(
            data, y, fit_intercept=False, normalize=False)
        assert_array_almost_equal(y_mean, np.zeros(n_outputs))
        assert_array_almost_equal(yt, y)

        # With an intercept the same checks apply for either normalize value.
        for normalize in (False, True):
            _, yt, _, y_mean, _ = _preprocess_data(
                data, y, fit_intercept=True, normalize=normalize)
            assert_array_almost_equal(y_mean, column_means)
            assert_array_almost_equal(yt, y - y_mean)
Пример #3
0
def test_preprocess_data():
    """Check offsets, scales and transformed data from `_preprocess_data`.

    Three settings are exercised on dense input: no intercept and no
    normalization, intercept only, and intercept with normalization.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    mean_X = np.mean(X, axis=0)
    norm_X = np.std(X, axis=0) * np.sqrt(X.shape[0])
    mean_y = np.mean(y, axis=0)
    no_offset = np.zeros(n_features)
    unit_scale = np.ones(n_features)

    # Columns: fit_intercept, normalize, expected X offset,
    #          expected y offset, expected X scale.
    scenarios = (
        (False, False, no_offset, 0, unit_scale),
        (True, False, mean_X, mean_y, unit_scale),
        (True, True, mean_X, mean_y, norm_X),
    )
    for (fit_intercept, normalize,
         wanted_X_mean, wanted_y_mean, wanted_X_norm) in scenarios:
        Xt, yt, X_mean, y_mean, X_norm = _preprocess_data(
            X, y, fit_intercept=fit_intercept, normalize=normalize)
        assert_array_almost_equal(X_mean, wanted_X_mean)
        assert_array_almost_equal(y_mean, wanted_y_mean)
        assert_array_almost_equal(X_norm, wanted_X_norm)
        # With a zero offset and a unit scale this reduces to the raw data.
        assert_array_almost_equal(Xt, (X - wanted_X_mean) / wanted_X_norm)
        assert_array_almost_equal(yt, y - wanted_y_mean)
Пример #4
0
def test_csr_preprocess_data():
    """Ensure `_preprocess_data` returns X in CSR format when given CSR."""
    dense, target = make_regression()
    # Zero out every entry below the threshold before converting to CSR.
    dense[dense < 2.5] = 0.0
    X_csr = sparse.csr_matrix(dense)

    X_out, _, _, _, _ = _preprocess_data(X_csr, target, True)
    assert X_out.getformat() == 'csr'
Пример #5
0
def test_sparse_preprocess_data_with_return_mean():
    """Check `_preprocess_data` on sparse (LIL) input with ``return_mean=True``.

    Three settings are exercised: no intercept and no normalization,
    intercept only, and intercept with normalization. In every case the
    returned X is expected to be un-centered (only rescaled when
    ``normalize=True``), while the reported ``X_mean`` is the column mean
    whenever an intercept is fitted.
    """
    n_samples = 200
    n_features = 2
    # Draw the sparse matrix from the module-level ``rng`` so the test data
    # is reproducible instead of depending on NumPy's global random state.
    # NOTE(review): assumes ``rng`` is a seeded RandomState - confirm.
    X = sparse.rand(n_samples, n_features, density=.5, random_state=rng)
    X = X.tolil()
    y = rng.rand(n_samples)
    XA = X.toarray()
    expected_X_norm = np.std(XA, axis=0) * np.sqrt(X.shape[0])

    # No intercept, no normalization: everything is passed through.
    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=False, normalize=False,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.zeros(n_features))
    assert_array_almost_equal(y_mean, 0)
    assert_array_almost_equal(X_norm, np.ones(n_features))
    # ``toarray()`` is the explicit spelling of the ``.A`` shorthand.
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y)

    # Intercept only: the mean is reported but X itself is left as is.
    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=False,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_norm, np.ones(n_features))
    assert_array_almost_equal(Xt.toarray(), XA)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))

    # Intercept and normalization: X is rescaled but still not centered.
    Xt, yt, X_mean, y_mean, X_norm = \
        _preprocess_data(X, y, fit_intercept=True, normalize=True,
                         return_mean=True)
    assert_array_almost_equal(X_mean, np.mean(XA, axis=0))
    assert_array_almost_equal(y_mean, np.mean(y, axis=0))
    assert_array_almost_equal(X_norm, expected_X_norm)
    assert_array_almost_equal(Xt.toarray(), XA / expected_X_norm)
    assert_array_almost_equal(yt, y - np.mean(y, axis=0))
Пример #6
0
def test_preprocess_copy_data_no_checks(is_sparse, to_copy):
    """Check that ``copy`` controls memory sharing when checks are skipped.

    With ``check_input=False``, the X returned by `_preprocess_data` must
    share memory with the input exactly when ``copy`` is falsy. For sparse
    input the comparison is made on the ``data`` buffers.
    """
    X, y = make_regression()
    X[X < 2.5] = 0.0
    if is_sparse:
        X = sparse.csr_matrix(X)

    X_, y_, _, _, _ = _preprocess_data(
        X, y, True, copy=to_copy, check_input=False)

    # Pick the underlying buffers to compare for the given container type.
    if is_sparse:
        buf_out, buf_in = X_.data, X.data
    else:
        buf_out, buf_in = X_, X

    shares_memory = np.may_share_memory(buf_out, buf_in)
    if to_copy:
        assert not shares_memory
    else:
        assert shares_memory
Пример #7
0
def test_dtype_preprocess_data():
    """Check that every `_preprocess_data` output takes the dtype of X.

    All four float32/float64 pairings of X and y are run for each
    combination of ``fit_intercept`` and ``normalize``. The inputs must keep
    their dtype, and the pure float32 and float64 results must agree.
    """
    n_samples, n_features = 200, 2
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples)

    X_32 = np.asarray(X, dtype=np.float32)
    y_32 = np.asarray(y, dtype=np.float32)
    X_64 = np.asarray(X, dtype=np.float64)
    y_64 = np.asarray(y, dtype=np.float64)

    for fit_intercept in [True, False]:
        for normalize in [True, False]:
            options = dict(fit_intercept=fit_intercept,
                           normalize=normalize,
                           return_mean=True)

            out_32 = _preprocess_data(X_32, y_32, **options)
            out_64 = _preprocess_data(X_64, y_64, **options)
            out_3264 = _preprocess_data(X_32, y_64, **options)
            out_6432 = _preprocess_data(X_64, y_32, **options)

            # The expected output dtype is the dtype of X, whatever y is.
            dtype_expectations = (
                (out_32, np.float32),
                (out_64, np.float64),
                (out_3264, np.float32),
                (out_6432, np.float64),
            )
            for outputs, wanted_dtype in dtype_expectations:
                Xt, yt, X_mean, y_mean, X_norm = outputs
                for item in (Xt, yt, X_mean, y_mean, X_norm):
                    assert item.dtype == wanted_dtype

            # The inputs must still carry their original dtype.
            input_expectations = (
                (X_32, np.float32),
                (y_32, np.float32),
                (X_64, np.float64),
                (y_64, np.float64),
            )
            for arr, wanted_dtype in input_expectations:
                assert arr.dtype == wanted_dtype

            # Single and double precision results agree item by item.
            for got_32, got_64 in zip(out_32, out_64):
                assert_array_almost_equal(got_32, got_64)