def make_sparse_low_rank(n_dim_obs=3,
                         n_dim_lat=2,
                         T=10,
                         epsilon=1e-3,
                         n_samples=50,
                         **kwargs):
    """Generate dataset (new new version)."""
    from sklearn.datasets import make_sparse_spd_matrix, make_low_rank_matrix

    # sparse SPD component and low-rank component for the first time step
    K = make_sparse_spd_matrix(n_dim_obs)
    L = make_low_rank_matrix(n_dim_obs, n_dim_obs, effective_rank=n_dim_lat)

    Ks = [K]
    Ls = [L]
    Kobs = [K - L]  # observed precision: sparse component minus low-rank component

    for i in range(1, T):
        K = K + make_sparse_spd_matrix(n_dim_obs)
        L = L + make_low_rank_matrix(
            n_dim_obs, n_dim_obs, effective_rank=n_dim_lat)

        # assert is_pos_def(K - L)
        # assert is_pos_semidef(L)

        Ks.append(K)
        Ls.append(L)
        Kobs.append(K - L)

    return Ks, Kobs, Ls
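
A minimal usage sketch for the generator above (not part of the original snippet; the call below is illustrative only):

# Hypothetical usage: build T = 4 sparse-plus-low-rank precision matrices and inspect
# their shapes. Positive definiteness of K - L is not enforced by the generator (the
# asserts are commented out), so only shapes are checked here.
Ks, Kobs, Ls = make_sparse_low_rank(n_dim_obs=5, n_dim_lat=2, T=4)
print(len(Kobs), Kobs[0].shape)  # 4 (5, 5)
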
Example #2
def test_singular_values(svd_solver):
    # Check that the IncrementalPCA output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng
    )
    X = da.from_array(X, chunks=[200, -1])

    pca = PCA(n_components=10, svd_solver=svd_solver, random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100, svd_solver=svd_solver).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(
        np.sum(pca.singular_values_ ** 2.0), np.linalg.norm(X_pca, "fro") ** 2.0, 12
    )
    assert_array_almost_equal(
        np.sum(ipca.singular_values_ ** 2.0), np.linalg.norm(X_ipca, "fro") ** 2.0, 2
    )

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(
        pca.singular_values_, np.sqrt(np.sum(X_pca ** 2.0, axis=0)), 12
    )
    assert_array_almost_equal(
        ipca.singular_values_, np.sqrt(np.sum(X_ipca ** 2.0, axis=0)), 2
    )

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng
    )
    X = da.from_array(X, chunks=[4, -1])

    pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100, svd_solver=svd_solver)

    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca ** 2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    X_hat = da.from_array(X_hat, chunks=(4, -1))
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=10, random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=3, random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)

    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
Example #4
def test_shaping_3_values(eng):

    svd = lambda x: SVD(k=3, method='direct', seed=0).fit(x)

    # baseline: ndarray (local) or BoltArray (spark)
    x = make_low_rank_matrix(n_samples=10, n_features=10, random_state=0)
    x = series.fromarray(x, engine=eng).values
    u, s, v = svd(x)

    # simple series
    x1 = series.fromarray(x)
    u1, s1, v1 = svd(x1)
    assert allclose(u, u1)
    assert allclose(s, s1)
    assert allclose(v, v1)

    # series with multiple dimensions
    x1 = series.fromarray(x.reshape(2, 5, 10))
    u1, s1, v1 = svd(x1)
    u1 = u1.reshape(10, 3)
    assert allclose(u, u1)
    assert allclose(s, s1)
    assert allclose(v, v1)

    # images (must have multiple dimensions)
    x1 = images.fromarray(x.reshape(10, 2, 5))
    u1, s1, v1 = svd(x1)
    v1 = v1.reshape(3, 10)
    assert allclose(u, u1)
    assert allclose(s, s1)
    assert allclose(v, v1)
Example #5
def test_shaping_2_values(eng):

    pca = lambda x: PCA(k=3, svd_method='direct', seed=0).fit(x)

    # baseline: ndarray (local) or BoltArray (spark)
    x = make_low_rank_matrix(n_samples=10, n_features=10, random_state=0)
    x = series.fromarray(x, engine=eng).values
    t, w = pca(x)

    # simple series
    x1 = series.fromarray(x)
    t1, w1 = pca(x1)
    assert allclose(t, t1)
    assert allclose(w, w1)

    # series with multiple dimensions
    x1 = series.fromarray(x.reshape(2, 5, 10))
    t1, w1 = pca(x1)
    t1 = t1.reshape(10, 3)
    assert allclose(t, t1)
    assert allclose(w, w1)

    # images (must have multiple dimensions)
    x1 = images.fromarray(x.reshape(10, 2, 5))
    t1, w1 = pca(x1)
    w1 = w1.reshape(3, 10)
    assert allclose(t, t1)
    assert allclose(w, w1)
Example #6
def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without a low-rank component: just regularly but slowly
    # decreasing singular values, so the data matrix has no effective low rank
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)
    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.1

        # compute the singular values of X using the fast approximate method
        # with iterated power method
        _, sap, _ = randomized_svd(X, k, n_iter=5,
                                   power_iteration_normalizer=normalizer)

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)
Example #7
def test_randomized_svd_transpose_consistency():
    # Check that transposing the design matrix has limited impact
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False,
                                random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True,
                                random_state=0)
    U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto',
                                random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
Example #10
def test_whitening(svd_solver):
    # Test that PCA and IncrementalPCA transforms match to sign flip.
    X = datasets.make_low_rank_matrix(1000,
                                      10,
                                      tail_strength=0.0,
                                      effective_rank=2,
                                      random_state=1999)
    X = da.from_array(X, chunks=[200, -1])
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc,
                  svd_solver=svd_solver).fit(X.compute())
        ipca = IncrementalPCA(whiten=True,
                              n_components=nc,
                              batch_size=250,
                              svd_solver=svd_solver).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X.compute(), Xinv_ipca, decimal=prec)
        assert_almost_equal(X.compute(), Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
Example #11
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalized='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, Vt = randomized_svd(X, n_components, n_iter=2,
                              power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(Vt))
    error_2 = linalg.norm(A, ord='fro')
    U, s, Vt = randomized_svd(X, n_components, n_iter=20,
                              power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(Vt))
    error_20 = linalg.norm(A, ord='fro')
    assert np.abs(error_2 - error_20) > 100

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, Vt = randomized_svd(X, n_components, n_iter=2,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)
        A = X - U.dot(np.diag(s).dot(Vt))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, Vt = randomized_svd(X, n_components, n_iter=i,
                                      power_iteration_normalizer=normalizer,
                                      random_state=0)
            A = X - U.dot(np.diag(s).dot(Vt))
            error = linalg.norm(A, ord='fro')
            assert 15 > np.abs(error_2 - error)
Example #12
def test_poisson():
    # For Poisson distributed target, Poisson loss should give better results
    # than least squares measured in Poisson deviance as metric.
    rng = np.random.RandomState(42)
    n_train, n_test, n_features = 500, 100, 100
    X = make_low_rank_matrix(n_samples=n_train + n_test,
                             n_features=n_features,
                             random_state=rng)
    # We create a log-linear Poisson model and downscale coef as it will get
    # exponentiated.
    coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
    y = rng.poisson(lam=np.exp(X @ coef))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=n_test,
                                                        random_state=rng)
    gbdt_pois = HistGradientBoostingRegressor(loss="poisson", random_state=rng)
    gbdt_ls = HistGradientBoostingRegressor(loss="squared_error",
                                            random_state=rng)
    gbdt_pois.fit(X_train, y_train)
    gbdt_ls.fit(X_train, y_train)
    dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)

    for X, y in [(X_train, y_train), (X_test, y_test)]:
        metric_pois = mean_poisson_deviance(y, gbdt_pois.predict(X))
        # squared_error might produce non-positive predictions => clip
        metric_ls = mean_poisson_deviance(
            y, np.clip(gbdt_ls.predict(X), 1e-15, None))
        metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
        assert metric_pois < metric_ls
        assert metric_pois < metric_dummy
Example #13
def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.1,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.01

        # compute the singular values of X using the fast approximate
        # method with iterated power method
        _, sap, _ = randomized_svd(X, k,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)
def test_make_low_rank_matrix():
    X = make_low_rank_matrix(n_samples=50, n_features=25, effective_rank=5,
                             tail_strength=0.01, random_state=0)

    assert_equal(X.shape, (50, 25), "X shape mismatch")

    from numpy.linalg import svd
    u, s, v = svd(X)
    assert_less(sum(s) - 5, 0.1, "X rank is not approximately 5")
Example #16
def make_regression(n_samples=100, n_features=50, effective_rank=10, tail_strength=0.5):
    """Make a synthetic regression problem using low rank matrices with an eigenspectrum of some
    effective rank with a tail. Splits matrix to produce train and test set.
    """
    X0 = make_low_rank_matrix(n_samples=2 * n_samples, n_features=n_features + 1,
                              effective_rank=effective_rank, tail_strength=tail_strength)
    X0 -= np.sum(X0, axis=0)
    X_train, X_test = X0[:n_samples, :n_features], X0[n_samples:, :n_features]
    y_train, y_test = X0[:n_samples, n_features], X0[n_samples:, n_features]
    return X_train, y_train, X_test, y_test
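
A minimal usage sketch for make_regression above (not part of the original snippet; Ridge is just one possible downstream estimator):

from sklearn.linear_model import Ridge

X_train, y_train, X_test, y_test = make_regression(n_samples=100, n_features=50)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)   # fit on the low-rank training split
print("test R^2:", ridge.score(X_test, y_test))  # evaluate on the held-out split
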
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == "lfw_people":
        X = fetch_lfw_people().data
    elif dataset_name == "20newsgroups":
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == "olivetti_faces":
        X = fetch_olivetti_faces().data
    elif dataset_name == "rcv1":
        X = fetch_rcv1().data
    elif dataset_name == "CIFAR":
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == "low rank matrix":
        X = make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )
    elif dataset_name == "uncorrelated matrix":
        X, _ = make_sparse_uncorrelated(n_samples=500,
                                        n_features=10000,
                                        random_state=random_state)
    elif dataset_name == "big sparse matrix":
        sparsity = int(1e6)
        size = int(1e6)
        small_size = int(1e4)
        data = np.random.normal(0, 1, int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_openml(dataset_name, parser="auto").data
    return X
Example #18
def test_fastica_eigh_low_rank_warning(global_random_seed):
    """Test FastICA eigh solver raises warning for low-rank data."""
    rng = np.random.RandomState(global_random_seed)
    X = make_low_rank_matrix(n_samples=10,
                             n_features=10,
                             random_state=rng,
                             effective_rank=2)
    ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh")
    msg = "There are some small singular values"
    with pytest.warns(UserWarning, match=msg):
        ica.fit(X)
Example #19
def random_X_y_coef(linear_model_loss,
                    n_samples,
                    n_features,
                    coef_bound=(-2, 2),
                    seed=42):
    """Random generate y, X and coef in valid range."""
    rng = np.random.RandomState(seed)
    n_dof = n_features + linear_model_loss.fit_intercept
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )

    if linear_model_loss.base_loss.is_multiclass:
        n_classes = linear_model_loss.base_loss.n_classes
        coef = np.empty((n_classes, n_dof))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_classes * n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
        else:
            raw_prediction = X @ coef.T
        proba = linear_model_loss.base_loss.link.inverse(raw_prediction)

        # y = rng.choice(np.arange(n_classes), p=proba) does not work.
        # See https://stackoverflow.com/a/34190035/16761084
        def choice_vectorized(items, p):
            s = p.cumsum(axis=1)
            r = rng.rand(p.shape[0])[:, None]
            k = (s < r).sum(axis=1)
            return items[k]

        y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64)
    else:
        coef = np.empty((n_dof, ))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        y = linear_model_loss.base_loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples))

    return X, y, coef
Example #20
def test_randomized_svd_sparse_warnings():
    # randomized_svd throws a warning for lil and dok matrix
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
    n_components = 5
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        X = cls(X)
        assert_warns_message(
            sparse.SparseEfficiencyWarning,
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__),
            randomized_svd, X, n_components, n_iter=1,
            power_iteration_normalizer='none')
Example #21
def test_svd(eng):
    x_local = make_low_rank_matrix(n_samples=10, n_features=50, random_state=0)
    x = fromarray(x_local.reshape(10, 10, 5), engine=eng)
    x.cache()
    x.count()
    u1, s1, v1 = randomized_svd(x_local, n_components=2, random_state=0)

    u2, v2, s2 = getSVD(x, k=2, getComponents=True, getS=True)
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    u2, v2, s2 = getSVD(x,
                        k=2,
                        getComponents=True,
                        getS=True,
                        normalization='nanmean')
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    u2, v2, s2 = getSVD(x,
                        k=2,
                        getComponents=True,
                        getS=True,
                        normalization='zscore')
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    u2, v2, s2 = getSVD(x,
                        k=2,
                        getComponents=True,
                        getS=True,
                        normalization=None)
    assert u1.shape == u2.shape
    assert s1.shape == s2.shape
    assert v1.shape == (2, 50)
    assert v2.shape == (2, 10, 5)

    with pytest.raises(ValueError) as ex:
        u2, v2, s2 = getSVD(x,
                            k=2,
                            getComponents=True,
                            getS=True,
                            normalization='error')
    assert 'Normalization should be one of' in str(ex.value)
Example #22
def test_explained_variances():
    # Test that PCA and IncrementalPCA calculations match
    X = datasets.make_low_rank_matrix(1000, 100, tail_strength=0.,
                                      effective_rank=10, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 99]:
        pca = PCA(n_components=nc).fit(X)
        ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X)
        assert_almost_equal(pca.explained_variance_, ipca.explained_variance_,
                            decimal=prec)
        assert_almost_equal(pca.explained_variance_ratio_,
                            ipca.explained_variance_ratio_, decimal=prec)
        assert_almost_equal(pca.noise_variance_, ipca.noise_variance_,
                            decimal=prec)
Example #23
def test_explained_variances():
    """Test that PCA and IncrementalPCA calculations match"""
    X = datasets.make_low_rank_matrix(1000, 100, tail_strength=0.,
                                      effective_rank=10, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 99]:
        pca = PCA(n_components=nc).fit(X)
        ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X)
        assert_almost_equal(pca.explained_variance_, ipca.explained_variance_,
                            decimal=prec)
        assert_almost_equal(pca.explained_variance_ratio_,
                            ipca.explained_variance_ratio_, decimal=prec)
        assert_almost_equal(pca.noise_variance_, ipca.noise_variance_,
                            decimal=prec)
Example #24
def test_svd(eng):
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.utils.extmath import randomized_svd
    u1, s1, v1 = randomized_svd(x.toarray(), n_components=2, random_state=0)

    u2, s2, v2 = SVD(k=2, method='direct').fit(x)
    assert allclose_sign(u1, u2)
    assert allclose(s1, s2)
    assert allclose_sign(v1.T, v2.T)

    u2, s2, v2 = SVD(k=2, method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(u1, u2, atol=tol)
    assert allclose(s1, s2, atol=tol)
    assert allclose_sign(v1.T, v2.T, atol=tol)
Example #26
def test_pca(eng):
    x = make_low_rank_matrix(n_samples=10, n_features=5, random_state=0)
    x = fromarray(x, engine=eng)

    from sklearn.decomposition import PCA as skPCA
    pca = skPCA(n_components=2)
    t1 = pca.fit_transform(x.toarray())
    w1_T = pca.components_

    t2, w2_T = PCA(k=2, svd_method='direct').fit(x)
    assert allclose_sign(w1_T.T, w2_T.T)
    assert allclose_sign(t1, t2)

    t2, w2_T = PCA(k=2, svd_method='em', max_iter=100, seed=0).fit(x)
    tol = 1e-1
    assert allclose_sign(w1_T.T, w2_T.T, atol=tol)
    assert allclose_sign(t1, t2, atol=tol)
def bench_b(power_list):

    n_samples, n_features = 1000, 10000
    data_params = {
        "n_samples": n_samples,
        "n_features": n_features,
        "tail_strength": 0.7,
        "random_state": random_state,
    }
    dataset_name = "low rank matrix %d x %d" % (n_samples, n_features)
    ranks = [10, 50, 100]

    if enable_spectral_norm:
        all_spectral = defaultdict(list)
    all_frobenius = defaultdict(list)
    for rank in ranks:
        X = make_low_rank_matrix(effective_rank=rank, **data_params)
        if enable_spectral_norm:
            X_spectral_norm = norm_diff(X, norm=2, msg=False, random_state=0)
        X_fro_norm = norm_diff(X, norm="fro", msg=False)

        for n_comp in [int(rank / 2), rank, rank * 2]:
            label = "rank=%d, n_comp=%d" % (rank, n_comp)
            print(label)
            for pi in power_list:
                U, s, V, _ = svd_timing(
                    X,
                    n_comp,
                    n_iter=pi,
                    n_oversamples=2,
                    power_iteration_normalizer="LU",
                )
                if enable_spectral_norm:
                    A = U.dot(np.diag(s).dot(V))
                    all_spectral[label].append(
                        norm_diff(X - A, norm=2, random_state=0) /
                        X_spectral_norm)
                f = scalable_frobenius_norm_discrepancy(X, U, s, V)
                all_frobenius[label].append(f / X_fro_norm)

    if enable_spectral_norm:
        title = "%s: spectral norm diff vs n power iteration" % (dataset_name)
        plot_power_iter_vs_s(power_iter, all_spectral, title)
    title = "%s: Frobenius norm diff vs n power iteration" % (dataset_name)
    plot_power_iter_vs_s(power_iter, all_frobenius, title)
Example #29
def test_whitening():
    """Test that PCA and IncrementalPCA transforms match to sign flip."""
    X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
                                      effective_rank=2, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc,
                              batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
Example #30
    def fetch_low_rank_matrix(self, effective_rank=None, tail_strength=None):
        """
        Generates synthetic data using sklearn make_low_rank with self.n,self.d but with 
        variable effective_rank and tail_strength if need be.
        """
        if effective_rank == None:
            eff_rank = self.effective_rank
        else:
            eff_rank = effective_rank

        if tail_strength == None:
            t_strength = self.tail_strength
        else:
            t_strength = tail_strength

        X = make_low_rank_matrix(n_samples=self.n,
                                 n_features=self.d,
                                 effective_rank=eff_rank,
                                 tail_strength=t_strength,
                                 random_state=self.rng)
        return X
Example #31
def compute_bench(samples_range, features_range, n_iter=3, rank=50):

    it = 0

    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples,
                                     n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            gc.collect()
            print("benchmarking scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=0")
            tstart = time()
            randomized_svd(X, rank, n_iter=0)
            results['scikit-learn randomized_svd (n_iter=0)'].append(time() -
                                                                     tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=%d " %
                  n_iter)
            tstart = time()
            randomized_svd(X, rank, n_iter=n_iter)
            results['scikit-learn randomized_svd (n_iter=%d)' %
                    n_iter].append(time() - tstart)

    return results
Example #32
def test_whitening():
    """Test that PCA and IncrementalPCA transforms match to sign flip."""
    X = datasets.make_low_rank_matrix(1000,
                                      10,
                                      tail_strength=0.,
                                      effective_rank=2,
                                      random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc,
                              batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
Example #33
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(X,
                                    k,
                                    power_iteration_normalizer=normalizer,
                                    random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k, )
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]),
                            np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
from sklearn import datasets
import matplotlib.pyplot as plt

# make_low_rank_matrix data
X = datasets.make_low_rank_matrix(n_samples=100, n_features=2, effective_rank=2, tail_strength=0.5, random_state=None)
print(X)

plt.scatter(X[:, 0], X[:, 1])
plt.show()
Example #35
def glm_dataset(global_random_seed, request):
    """Dataset with GLM solutions, well conditioned X.

    This is inspired by ols_ridge_dataset in test_ridge.py.

    The construction is based on the SVD decomposition of X = U S V'.

    Parameters
    ----------
    type : {"long", "wide"}
        If "long", then n_samples > n_features.
        If "wide", then n_features > n_samples.
    model : a GLM model

    For "wide", we return the minimum norm solution:

        min ||w||_2 subject to w = argmin deviance(X, y, w)

    Note that the deviance is always minimized if y = inverse_link(X w) is possible to
    achieve, which it is in the wide data case. Therefore, we can construct the
    solution with minimum norm like (wide) OLS:

        min ||w||_2 subject to link(y) = raw_prediction = X w

    Returns
    -------
    model : GLM model
    X : ndarray
        Last column of 1, i.e. intercept.
    y : ndarray
    coef_unpenalized : ndarray
        Minimum norm solutions, i.e. min sum(loss(w)) (with minimum ||w||_2 in
        case of ambiguity)
        Last coefficient is intercept.
    coef_penalized : ndarray
        GLM solution with alpha=l2_reg_strength=1, i.e.
        min 1/n * sum(loss) + ||w[:-1]||_2^2.
        Last coefficient is intercept.
    l2_reg_strength : float
        Always equal 1.
    """
    data_type, model = request.param
    # Make larger dim more than double as big as the smaller one.
    # This helps when constructing singular matrices like (X, X).
    if data_type == "long":
        n_samples, n_features = 12, 4
    else:
        n_samples, n_features = 4, 12
    k = min(n_samples, n_features)
    rng = np.random.RandomState(global_random_seed)
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=k,
        tail_strength=0.1,
        random_state=rng,
    )
    X[:, -1] = 1  # last column acts as intercept
    U, s, Vt = linalg.svd(X, full_matrices=False)
    assert np.all(s > 1e-3)  # to be sure
    assert np.max(s) / np.min(s) < 100  # condition number of X

    if data_type == "long":
        coef_unpenalized = rng.uniform(low=1, high=3, size=n_features)
        coef_unpenalized *= rng.choice([-1, 1], size=n_features)
        raw_prediction = X @ coef_unpenalized
    else:
        raw_prediction = rng.uniform(low=-3, high=3, size=n_samples)
        # minimum norm solution min ||w||_2 such that raw_prediction = X w:
        # w = X'(XX')^-1 raw_prediction = V s^-1 U' raw_prediction
        coef_unpenalized = Vt.T @ np.diag(1 / s) @ U.T @ raw_prediction

    linear_loss = LinearModelLoss(base_loss=model._get_loss(),
                                  fit_intercept=True)
    sw = np.full(shape=n_samples, fill_value=1 / n_samples)
    y = linear_loss.base_loss.link.inverse(raw_prediction)

    # Add penalty l2_reg_strength * ||coef||_2^2 for l2_reg_strength=1 and solve with
    # optimizer. Note that the problem is well conditioned such that we get accurate
    # results.
    l2_reg_strength = 1
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_with_intercept = _special_minimize(fun,
                                                      grad,
                                                      coef_unpenalized,
                                                      tol_NM=1e-6,
                                                      tol=1e-14)

    linear_loss = LinearModelLoss(base_loss=model._get_loss(),
                                  fit_intercept=False)
    fun = partial(
        linear_loss.loss,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    grad = partial(
        linear_loss.gradient,
        X=X[:, :-1],
        y=y,
        sample_weight=sw,
        l2_reg_strength=l2_reg_strength,
    )
    coef_penalized_without_intercept = _special_minimize(fun,
                                                         grad,
                                                         coef_unpenalized[:-1],
                                                         tol_NM=1e-6,
                                                         tol=1e-14)

    # To be sure
    assert np.linalg.norm(coef_penalized_with_intercept) < np.linalg.norm(
        coef_unpenalized)

    return (
        model,
        X,
        y,
        coef_unpenalized,
        coef_penalized_with_intercept,
        coef_penalized_without_intercept,
        l2_reg_strength,
    )
Example #36
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='blue', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='red')
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


def main():
    dataMat, labelMat = loadDataSet()
    weights = gradAscent(dataMat, labelMat).getA()
    plotBestFit(weights)


if __name__ == '__main__':
    main()
    X = datasets.make_low_rank_matrix(n_samples=100,
                                      n_features=100,
                                      effective_rank=10,
                                      random_state=None)
    Y = datasets.make_low_rank_matrix(n_samples=100,
                                      n_features=1,
                                      effective_rank=10,
                                      random_state=None)
    print(X)
    plt.plot(X, Y)
    plt.show()
Example #37
                         color="darkorange", lw=lw)
        plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                     color="navy", lw=lw)
        plt.fill_between(param_range, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.2,
                         color="navy", lw=lw)
        plt.legend(loc="best")
        plt.savefig('figures/syn_best_alpha_{}.png'.format(method_name))
        plt.clf()

np.random.seed(0)
n_samples = 200
n_features = 500
e_rank = 30
X = make_low_rank_matrix(n_samples=n_samples,
                         n_features=n_features + 1,
                         effective_rank=e_rank,
                         tail_strength=0.5)
X, y = X[:, :-1], X[:, -1]
print('data shape:', X.shape)



methods = [("Ridge", Ridge(), "alpha", np.logspace(-3, 0, 20)),
           ("LASSO", Lasso(), "alpha", np.logspace(-3, 0, 20)),
           ("Echo", er.EchoRegression(), 'alpha', np.logspace(-1, 4, 20)),
           ("OLS", LinearRegression(), "fit_intercept", [True])]

for method_name, method, param_name, param_range in methods:
    train_scores, test_scores = validation_curve(method, X, y, cv=10, scoring='neg_mean_squared_error',
                                                 param_name=param_name,
                                                 param_range=param_range)
Example #38
def make_varratio_exercise():
    return make_low_rank_matrix(tail_strength=0.1)
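
A minimal usage sketch (not from the original snippet): inspect the singular value spectrum of the generated matrix to see how the explained-variance ratios decay.

import numpy as np

X = make_varratio_exercise()
s = np.linalg.svd(X, compute_uv=False)  # singular values of the low-rank matrix
print((s ** 2 / np.sum(s ** 2))[:5])    # leading explained-variance ratios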