Пример #1
0
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalized='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, V = randomized_svd(X, n_components, n_iter=2,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_2 = linalg.norm(A, ord='fro')
    U, s, V = randomized_svd(X, n_components, n_iter=20,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_20 = linalg.norm(A, ord='fro')
    assert_greater(np.abs(error_2 - error_20), 100)

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, V = randomized_svd(X, n_components, n_iter=2,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, V = randomized_svd(X, n_components, n_iter=i,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            A = X - U.dot(np.diag(s).dot(V))
            error = linalg.norm(A, ord='fro')
            assert_greater(15, np.abs(error_2 - error))
Пример #2
0
def bench_b(power_list):

    n_samples, n_features = 1000, 10000
    data_params = {'n_samples': n_samples, 'n_features': n_features,
                   'tail_strength': .7, 'random_state': random_state}
    dataset_name = "low rank matrix %d x %d" % (n_samples, n_features)
    ranks = [10, 50, 100]

    if enable_spectral_norm:
        all_spectral = defaultdict(list)
    all_frobenius = defaultdict(list)
    for rank in ranks:
        X = make_low_rank_matrix(effective_rank=rank, **data_params)
        if enable_spectral_norm:
            X_spectral_norm = norm_diff(X, norm=2, msg=False)
        X_fro_norm = norm_diff(X, norm='fro', msg=False)

        for n_comp in [np.int(rank/2), rank, rank*2]:
            label = "rank=%d, n_comp=%d" % (rank, n_comp)
            print(label)
            for pi in power_list:
                U, s, V, _ = svd_timing(X, n_comp, n_iter=pi, n_oversamples=2,
                                        power_iteration_normalizer='LU')
                if enable_spectral_norm:
                    A = U.dot(np.diag(s).dot(V))
                    all_spectral[label].append(norm_diff(X - A, norm=2) /
                                               X_spectral_norm)
                f = scalable_frobenius_norm_discrepancy(X, U, s, V)
                all_frobenius[label].append(f / X_fro_norm)

    if enable_spectral_norm:
        title = "%s: spectral norm diff vs n power iteration" % (dataset_name)
        plot_power_iter_vs_s(power_iter, all_spectral, title)
    title = "%s: frobenius norm diff vs n power iteration" % (dataset_name)
    plot_power_iter_vs_s(power_iter, all_frobenius, title)
Пример #3
0
def tableresults():
    samples_range = np.array([20000])
    features_range = np.array([10000])
    iter_range = np.arange(0, 6, 1).astype(np.int)
    
    it = 0
    results = []
      
    max_it = len(samples_range) * len(features_range)*len(iter_range)
    for n_samples in samples_range:
        for n_features in features_range:
            for iter in iter_range:
                it += 1
                print('====================')
                print('Iteration %03d of %03d' % (it, max_it))
                print('====================')
                X = make_low_rank_matrix(n_samples, n_features,
                                      effective_rank=500,
                                      tail_strength=0.2)
               
                s1,v1,d1 = svd(X, full_matrices=False)
                value1 = v1[0]
                print("benchmarking randomized_svd: n_iter= %d" %iter)
                s2,v2,d2=randomized_svd(X, 1000, n_iter=iter)
                value2 = v2[0]
                results.append(value2/value1)
    
                gc.collect()
    return results       
Пример #4
0
def test_fast_svd_low_rank():
    """Check that extmath.fast_svd is consistent with linalg.svd"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
        effective_rank=rank, tail_strength=0.0, random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_equal(Ua.shape, (n_samples, k))
    assert_equal(sa.shape, (k,))
    assert_equal(Va.shape, (k, n_features))

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_almost_equal(s[:rank], sa[:rank])
Пример #5
0
def test_randomized_svd_transpose_consistency():
    # Check that transposing the design matrix has limited impact
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False,
                                random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True,
                                random_state=0)
    U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto',
                                random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
def tableresults():
    samples_range = np.array([1000000])
    features_range = np.arange(100, 2100, 400)
    iter_range = np.array([3])

    it = 0
    results = []

    max_it = len(samples_range) * len(features_range) * len(iter_range)
    for n_samples in samples_range:
        for n_features in features_range:
            for iter in iter_range:
                it += 1
                print('====================')
                print('Iteration %03d of %03d' % (it, max_it))
                print('====================')
                X = make_low_rank_matrix(n_samples,
                                         n_features,
                                         effective_rank=500,
                                         tail_strength=0.2)

                gc.collect()
                print("benchmarking randomized_svd: n_iter= %d" % iter)
                tstart = time()
                randomized_svd(X, 500, n_iter=iter)
                results.append(time() - tstart)

                gc.collect()
    return results
Пример #7
0
def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
    timeset = defaultdict(lambda: [])
    err = defaultdict(lambda: [])

    for n_samples in samples_range:
        for n_features in features_range:
            print("%2d samples, %2d features" % (n_samples, n_features))
            print('=======================')
            X = np.abs(make_low_rank_matrix(n_samples, n_features,
                       effective_rank=rank, tail_strength=0.2))

            gc.collect()
            print("benchmarking nndsvd-nmf: ")
            tstart = time()
            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
            tend = time() - tstart
            timeset['nndsvd-nmf'].append(tend)
            err['nndsvd-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking nndsvda-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvda',
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvda-nmf'].append(tend)
            err['nndsvda-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking nndsvdar-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvdar',
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvdar-nmf'].append(tend)
            err['nndsvdar-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking random-nmf")
            tstart = time()
            m = NMF(n_components=30, init='random', max_iter=1000,
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['random-nmf'].append(tend)
            err['random-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking alt-random-nmf")
            tstart = time()
            W, H = alt_nnmf(X, r=30, init='random', tol=tolerance)
            tend = time() - tstart
            timeset['alt-random-nmf'].append(tend)
            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
            report(norm(X - np.dot(W, H)), tend)

    return timeset, err
Пример #8
0
def test_fast_svd_low_rank():
    """Check that extmath.fast_svd is consistent with linalg.svd"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0, random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_equal(Ua.shape, (n_samples, k))
    assert_equal(sa.shape, (k,))
    assert_equal(Va.shape, (k, n_features))

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_almost_equal(s[:rank], sa[:rank])
Пример #9
0
def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalized='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, V = randomized_svd(X, n_components, n_iter=2,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_2 = linalg.norm(A, ord='fro')
    U, s, V = randomized_svd(X, n_components, n_iter=20,
                             power_iteration_normalizer='none')
    A = X - U.dot(np.diag(s).dot(V))
    error_20 = linalg.norm(A, ord='fro')
    assert_greater(np.abs(error_2 - error_20), 100)

    for normalizer in ['LU', 'QR', 'auto']:
        U, s, V = randomized_svd(X, n_components, n_iter=2,
                                 power_iteration_normalizer=normalizer,
                                 random_state=0)
        A = X - U.dot(np.diag(s).dot(V))
        error_2 = linalg.norm(A, ord='fro')

        for i in [5, 10, 50]:
            U, s, V = randomized_svd(X, n_components, n_iter=i,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            A = X - U.dot(np.diag(s).dot(V))
            error = linalg.norm(A, ord='fro')
            assert_greater(15, np.abs(error_2 - error))
Пример #10
0
def test_randomized_svd_transpose_consistency():
    """Check that transposing the design matrix has limit impact"""
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False,
                                random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True,
                                random_state=0)
    U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto',
                                random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)
Пример #11
0
def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X wity structure approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.1,
                             random_state=0)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ['auto', 'none', 'LU', 'QR']:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(X, k, n_iter=0,
                                  power_iteration_normalizer=normalizer,
                                  random_state=0)

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.01

        # compute the singular values of X using the fast approximate
        # method with iterated power method
        _, sap, _ = randomized_svd(X, k,
                                   power_iteration_normalizer=normalizer,
                                   random_state=0)

        # the iterated power method is helping getting rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)
Пример #12
0
def test_randomized_svd_low_rank_with_noise():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X wity structure approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.05)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method is helping getting rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
Пример #13
0
def test_randomized_svd_infinite_rank():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.1)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
Пример #14
0
def compute_bench(samples_range, features_range, q=3, rank=50):

    it = 0

    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print '===================='
            print 'Iteration %03d of %03d' % (it, max_it)
            print '===================='
            X = make_low_rank_matrix(n_samples, n_features, effective_rank=rank,
                                  tail_strength=0.2)

            gc.collect()
            print "benching scipy svd: "
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print "benching scikit-learn fast_svd: q=0"
            tstart = time()
            fast_svd(X, rank, q=0)
            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)

            gc.collect()
            print "benching scikit-learn fast_svd: q=%d " % q
            tstart = time()
            fast_svd(X, rank, q=q)
            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)

    return results
Пример #15
0
def test_randomized_svd_low_rank_with_noise():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X wity structure approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.05)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method is helping getting rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
Пример #16
0
def test_randomized_svd_infinite_rank():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without 'low_rank component': just regularly but slowly
    # decreasing singular values: the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.1)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
Пример #17
0
def timing(X,rank, n_oversamples=10, n_iter=3):
    X = make_low_rank_matrix(1000,1000,effective_rank=1000,tail_strength=0.2)
    gc.collect()
    tstart = time()
    randomized_svd(X, rank, n_iter=n_iter)
    results = float(time() - tstart)
    print("benchmarking randomized_svd: time=%f "
                  % results)
Пример #18
0
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
            for i in range(5)
        ]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500,
                                 n_features=np.int(1e4),
                                 effective_rank=100,
                                 tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500,
                                        n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = np.int(1e6)
        size = np.int(1e6)
        small_size = np.int(1e4)
        data = np.random.normal(0, 1, np.int(sparsity / 10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
Пример #19
0
def test_randomized_svd_sparse_warnings():
    # randomized_svd throws a warning for lil and dok matrix
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
    n_components = 5
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        X = cls(X)
        assert_warns_message(
            sparse.SparseEfficiencyWarning,
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__),
            randomized_svd, X, n_components, n_iter=1,
            power_iteration_normalizer='none')
Пример #20
0
def test_randomized_svd_sparse_warnings():
    # randomized_svd throws a warning for lil and dok matrix
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
    n_components = 5
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        X = cls(X)
        assert_warns_message(
            sparse.SparseEfficiencyWarning,
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__),
            randomized_svd, X, n_components, n_iter=1,
            power_iteration_normalizer='none')
def compute_bench(samples_range,
                  features_range,
                  iter_range=[3],
                  rank_range=[50]):

    it = 0

    results = []
    results1 = []
    results2 = []

    max_it = len(samples_range) * len(features_range) * len(rank_range) * len(
        iter_range)
    for n_samples in samples_range:
        for n_features in features_range:
            for rank in rank_range:
                for n_iter in iter_range:
                    it += 1
                    print('====================')
                    print('Iteration %03d of %03d' % (it, max_it))
                    print('====================')
                    X = make_low_rank_matrix(n_samples,
                                             n_features,
                                             effective_rank=rank,
                                             tail_strength=0.2)
                    value = norm(X)
                    gc.collect()
                    print("benchmarking scipy svd: ")
                    tstart = time()
                    s1, v1, d1 = svd(X, full_matrices=False)
                    value1 = v1[0]
                    results.append(value1 / value1)

                    gc.collect()
                    print("benchmarking randomized_svd: n_iter=1")
                    tstart = time()
                    s2, v2, d2 = randomized_svd(X, rank, n_iter=1)
                    value2 = v2[0]
                    results1.append(value2 / value1)

                    gc.collect()
                    print("benchmarking randomized_svd: n_iter=%d " %
                          iter_range[0])
                    tstart = time()
                    s3, v3, d3 = randomized_svd(X, rank, n_iter=n_iter)
                    value3 = v3[0]
                    results2.append(value3 / value1)

    return results, results1, results2
def get_data(dataset_name):
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == 'lfw_people':
        X = fetch_lfw_people().data
    elif dataset_name == '20newsgroups':
        X = fetch_20newsgroups_vectorized().data[:, :100000]
    elif dataset_name == 'olivetti_faces':
        X = fetch_olivetti_faces().data
    elif dataset_name == 'rcv1':
        X = fetch_rcv1().data
    elif dataset_name == 'CIFAR':
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return
        X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1))
              for i in range(5)]
        X = np.vstack(X1)
        del X1
    elif dataset_name == 'SVHN':
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return
        X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X']
        X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])]
        X = np.vstack(X2)
        del X1
        del X2
    elif dataset_name == 'low rank matrix':
        X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4),
                                 effective_rank=100, tail_strength=.5,
                                 random_state=random_state)
    elif dataset_name == 'uncorrelated matrix':
        X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000,
                                        random_state=random_state)
    elif dataset_name == 'big sparse matrix':
        sparsity = np.int(1e6)
        size = np.int(1e6)
        small_size = np.int(1e4)
        data = np.random.normal(0, 1, np.int(sparsity/10))
        data = np.repeat(data, 10)
        row = np.random.uniform(0, small_size, sparsity)
        col = np.random.uniform(0, small_size, sparsity)
        X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size))
        del data
        del row
        del col
    else:
        X = fetch_mldata(dataset_name).data
    return X
def compute_bench(samples_range,
                  features_range,
                  iter_range=[3],
                  rank_range=[50]):

    it = 0

    results = []
    results1 = []
    results2 = []

    max_it = len(samples_range) * len(features_range) * len(rank_range) * len(
        iter_range)
    for n_samples in samples_range:
        for n_features in features_range:
            for rank in rank_range:
                for n_iter in iter_range:
                    it += 1
                    print('====================')
                    print('Iteration %03d of %03d' % (it, max_it))
                    print('====================')
                    X = make_low_rank_matrix(n_samples,
                                             n_features,
                                             effective_rank=50,
                                             tail_strength=0.2)

                    gc.collect()
                    print("benchmarking scipy svd: ")
                    tstart = time()
                    svd(X, full_matrices=False)
                    results.append(time() - tstart)

                    gc.collect()
                    print("benchmarking randomized_svd: n_iter=1")
                    tstart = time()
                    randomized_svd(X, rank, n_iter=1)
                    results1.append(time() - tstart)

                    gc.collect()
                    print("benchmarking randomized_svd: n_iter=%d " %
                          iter_range[0])
                    tstart = time()
                    randomized_svd(X, rank, n_iter=n_iter)
                    results2.append(time() - tstart)

    return results, results1, results2
Пример #24
0
def compute_bench(samples_range, features_range, n_iterations=3, rank=50):

    it = 0

    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print '===================='
            print 'Iteration %03d of %03d' % (it, max_it)
            print '===================='
            X = make_low_rank_matrix(n_samples,
                                     n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2)

            gc.collect()
            print "benching scipy svd: "
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print "benching scikit-learn randomized_svd: n_iterations=0"
            tstart = time()
            randomized_svd(X, rank, n_iterations=0)
            results['scikit-learn randomized_svd (n_iterations=0)'].append(
                time() - tstart)

            gc.collect()
            print("benching scikit-learn randomized_svd: n_iterations=%d " %
                  n_iterations)
            tstart = time()
            randomized_svd(X, rank, n_iterations=n_iterations)
            results['scikit-learn randomized_svd (n_iterations=%d)' %
                    n_iterations].append(time() - tstart)

    return results
Пример #25
0
def compute_bench(samples_range, features_range, n_iter=3, rank=50):

    it = 0

    results = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = make_low_rank_matrix(n_samples, n_features,
                                  effective_rank=rank,
                                  tail_strength=0.2)

            gc.collect()
            print("benchmarking scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=0")
            tstart = time()
            randomized_svd(X, rank, n_iter=0)
            results['scikit-learn randomized_svd (n_iter=0)'].append(
                time() - tstart)

            gc.collect()
            print("benchmarking scikit-learn randomized_svd: n_iter=%d "
                  % n_iter)
            tstart = time()
            randomized_svd(X, rank, n_iter=n_iter)
            results['scikit-learn randomized_svd (n_iter=%d)'
                    % n_iter].append(time() - tstart)

    return results
Пример #26
0
def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
    it = 0
    timeset = defaultdict(lambda: [])
    err = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print '===================='
            print 'Iteration %03d of %03d' % (it, max_it)
            print '===================='
            X = np.abs(
                make_low_rank_matrix(n_samples,
                                     n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2))

            gc.collect()
            print "benching nndsvd-nmf: "
            tstart = time()
            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
            tend = time() - tstart
            timeset['nndsvd-nmf'].append(tend)
            err['nndsvd-nmf'].append(m.reconstruction_err_)
            print m.reconstruction_err_, tend

            gc.collect()
            print "benching nndsvda-nmf: "
            tstart = time()
            m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvda-nmf'].append(tend)
            err['nndsvda-nmf'].append(m.reconstruction_err_)
            print m.reconstruction_err_, tend

            gc.collect()
            print "benching nndsvdar-nmf: "
            tstart = time()
            m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvdar-nmf'].append(tend)
            err['nndsvdar-nmf'].append(m.reconstruction_err_)
            print m.reconstruction_err_, tend

            gc.collect()
            print "benching random-nmf"
            tstart = time()
            m = NMF(n_components=30, init=None, max_iter=1000,
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['random-nmf'].append(tend)
            err['random-nmf'].append(m.reconstruction_err_)
            print m.reconstruction_err_, tend

            gc.collect()
            print "benching alt-random-nmf"
            tstart = time()
            W, H = alt_nnmf(X, r=30, R=None, tol=tolerance)
            tend = time() - tstart
            timeset['alt-random-nmf'].append(tend)
            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
            print np.linalg.norm(X - np.dot(W, H)), tend

    return timeset, err
Пример #27
0
def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
    it = 0
    timeset = defaultdict(lambda: [])
    err = defaultdict(lambda: [])

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = np.abs(make_low_rank_matrix(n_samples, n_features,
                       effective_rank=rank,  tail_strength=0.2))

            gc.collect()
            print("benching nndsvd-nmf: ")
            tstart = time()
            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
            tend = time() - tstart
            timeset['nndsvd-nmf'].append(tend)
            err['nndsvd-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching nndsvda-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvda',
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvda-nmf'].append(tend)
            err['nndsvda-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching nndsvdar-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvdar',
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvdar-nmf'].append(tend)
            err['nndsvdar-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching random-nmf")
            tstart = time()
            m = NMF(n_components=30, init=None, max_iter=1000,
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['random-nmf'].append(tend)
            err['random-nmf'].append(m.reconstruction_err_)
            print(m.reconstruction_err_, tend)

            gc.collect()
            print("benching alt-random-nmf")
            tstart = time()
            W, H = alt_nnmf(X, r=30, R=None, tol=tolerance)
            tend = time() - tstart
            timeset['alt-random-nmf'].append(tend)
            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
            print(np.linalg.norm(X - np.dot(W, H)), tend)

    return timeset, err
Пример #28
0
def benchmark(samples_range, features_range, rank=50, tolerance=1e-5):
    timeset = defaultdict(lambda: [])
    err = defaultdict(lambda: [])

    for n_samples in samples_range:
        for n_features in features_range:
            print("%2d samples, %2d features" % (n_samples, n_features))
            print('=======================')
            X = np.abs(
                make_low_rank_matrix(n_samples,
                                     n_features,
                                     effective_rank=rank,
                                     tail_strength=0.2))

            gc.collect()
            print("benchmarking nndsvd-nmf: ")
            tstart = time()
            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
            tend = time() - tstart
            timeset['nndsvd-nmf'].append(tend)
            err['nndsvd-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking nndsvda-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvda-nmf'].append(tend)
            err['nndsvda-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking nndsvdar-nmf: ")
            tstart = time()
            m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['nndsvdar-nmf'].append(tend)
            err['nndsvdar-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking random-nmf")
            tstart = time()
            m = NMF(n_components=30,
                    init='random',
                    max_iter=1000,
                    tol=tolerance).fit(X)
            tend = time() - tstart
            timeset['random-nmf'].append(tend)
            err['random-nmf'].append(m.reconstruction_err_)
            report(m.reconstruction_err_, tend)

            gc.collect()
            print("benchmarking alt-random-nmf")
            tstart = time()
            W, H = alt_nnmf(X, r=30, init='random', tol=tolerance)
            tend = time() - tstart
            timeset['alt-random-nmf'].append(tend)
            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
            report(norm(X - np.dot(W, H)), tend)

    return timeset, err
Пример #29
0
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples,
                             n_features=n_features,
                             effective_rank=rank,
                             tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(X,
                                    k,
                                    power_iteration_normalizer=normalizer,
                                    random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert_equal(Ua.shape, (n_samples, k))
        assert_equal(sa.shape, (k, ))
        assert_equal(Va.shape, (k, n_features))

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]),
                            np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
Пример #30
0
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.0,
                             random_state=0).astype(dtype, copy=False)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    V = V.astype(dtype, copy=False)

    for normalizer in ['auto', 'LU', 'QR']:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0)

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert_equal(Ua.shape, (n_samples, k))
        assert_equal(sa.shape, (k,))
        assert_equal(Va.shape, (k, n_features))

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va),
                            decimal=decimal)

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = \
            randomized_svd(X, k, power_iteration_normalizer=normalizer,
                           random_state=0)
        if dtype.kind == 'f':
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == 'f'
            assert sa.dtype.kind == 'f'
            assert Va.dtype.kind == 'f'

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)