Example #1
# Imports assumed for this snippet; fast_svd lived in the scikit-learn of
# that era (it was later renamed randomized_svd):
import gc
from collections import defaultdict
from time import time

from scipy.linalg import svd
from scikits.learn.utils.extmath import fast_svd


def compute_bench(data_gen, samples_range, features_range, q=3):

    it = 0

    results = defaultdict(list)

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            # (the original snippet called a global `make_data` here,
            # ignoring the `data_gen` parameter)
            X = data_gen(n_samples, n_features)
            rank = min(n_samples, n_features) // 10 + 1

            gc.collect()
            print("benching scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=0")
            tstart = time()
            fast_svd(X, rank, q=0)
            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=%d" % q)
            tstart = time()
            fast_svd(X, rank, q=q)
            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)

    return results
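
A minimal driver for this benchmark might look like the sketch below; the
Gaussian `make_data` and the ranges are illustrative assumptions, not part
of the original snippet:

import numpy as np

def make_data(n_samples, n_features):
    # any dense generator works; a Gaussian matrix is the simplest choice
    return np.random.RandomState(42).randn(n_samples, n_features)

results = compute_bench(make_data,
                        samples_range=[500, 1000, 2000],
                        features_range=[500, 1000, 2000])
for label, timings in sorted(results.items()):
    print(label, timings)
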
Example #2
def test_fast_svd_low_rank():
    """Check that extmath.fast_svd is consistent with linalg.svd"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = low_rank_fat_tail(n_samples, n_features, effective_rank=rank,
                          tail_strength=0.0, seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_equal(Ua.shape, (n_samples, k))
    assert_equal(sa.shape, (k,))
    assert_equal(Va.shape, (k, n_features))

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_almost_equal(s[:rank], sa[:rank])
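
For reference, `fast_svd` survives in modern scikit-learn as
`randomized_svd` in `sklearn.utils.extmath`, with the `q` argument renamed
`n_iter`; a minimal sketch of the same low-rank check against the current
API, assuming only that rename:

import numpy as np
from scipy import linalg
from sklearn.utils.extmath import randomized_svd

rng = np.random.RandomState(0)
X = rng.randn(100, 5) @ rng.randn(5, 500)   # exactly rank 5

_, s, _ = linalg.svd(X, full_matrices=False)
_, sa, _ = randomized_svd(X, n_components=10, n_iter=5, random_state=0)

# the leading singular values agree closely on a low-rank matrix
assert np.allclose(s[:5], sa[:5])
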
Example #3
def test_fast_svd_low_rank_with_noise():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and a
    # significant noisy component
    X = low_rank_fat_tail(n_samples, n_features, effective_rank=rank,
                          tail_strength=0.5, seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method without
    # the iterated power method
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.05

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method helps get rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
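
The `q` argument is the number of power iterations from the Halko,
Martinsson & Tropp randomized range finder. A self-contained NumPy sketch
of the basic algorithm (not the library's exact implementation) shows where
`q` enters: each iteration effectively raises the singular values of the
sketched operator to the power 2q+1, which suppresses the noisy tail
relative to the signal and explains why q=5 recovers s[:k] above.

import numpy as np

def randomized_svd_sketch(X, k, q=0, n_oversamples=10, seed=0):
    """Bare-bones randomized SVD sketch (no re-orthonormalization between
    power iterations, so numerical accuracy degrades for large q)."""
    rng = np.random.RandomState(seed)
    Omega = rng.normal(size=(X.shape[1], k + n_oversamples))
    Y = X @ Omega                # random sketch of the range of X
    for _ in range(q):           # power iterations: Y = (X X^T)^q X Omega
        Y = X @ (X.T @ Y)
    Q, _ = np.linalg.qr(Y)       # orthonormal basis for the sketched range
    B = Q.T @ X                  # small (k + oversamples) x n_features matrix
    Uhat, s, Vt = np.linalg.svd(B, full_matrices=False)
    return (Q @ Uhat)[:, :k], s[:k], Vt[:k]
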
Example #4
def fast_pseudoinverse(matrix, precision):
    """Approximate Moore-Penrose pseudoinverse via a truncated fast_svd.

    `precision` is the percentage of the smaller matrix dimension to keep
    as singular triplets.
    """
    if matrix.shape[0] <= matrix.shape[1]:
        # fat matrix: A = U S Vt, so pinv(A) = V S^-1 Ut
        val = int((precision * matrix.shape[0]) / 100)
        u, s, vt = slue.fast_svd(matrix, val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))

        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del u, s, vt, UT, SI, VT, temp_matrix
    else:
        # tall matrix: factor the transpose; svd(A.T) = u s vt gives
        # pinv(A) = u S^-1 vt
        val = int((precision * matrix.transpose().shape[0]) / 100)
        u, s, vt = slue.fast_svd(matrix.transpose(), val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))

        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del u, s, vt, UT, SI, VT, temp_matrix

    return pinv_matrix.tocsr()
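
`slue` and `spmatrixmul` are this project's own import alias and sparse
multiply helper (their definitions are not shown here), and `ss` is
presumably `scipy.sparse`. Assuming those helpers are available, a sanity
check of the result could look like this sketch; since the SVD is
truncated, the Moore-Penrose identity A A+ A = A only holds approximately:

import numpy as np
import scipy.sparse as ss

A = ss.random(200, 50, density=0.05, format='csr', random_state=0)
A_pinv = fast_pseudoinverse(A, precision=50)   # keep 50% of the small side

# residual of the (approximate) Moore-Penrose identity
residual = A @ (A_pinv @ A) - A
print(np.abs(residual.toarray()).max())
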
Example #5
def test_fast_svd_infinite_rank():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # no low-rank component this time: just regularly but slowly decreasing
    # singular values, so the data matrix has no clear rank cutoff
    X = low_rank_fat_tail(n_samples,
                          n_features,
                          effective_rank=rank,
                          tail_strength=1.0,
                          seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method without
    # the iterated power method
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.1

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
Example #6
def test_fast_svd_infinite_rank():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # no low-rank component this time: just regularly but slowly decreasing
    # singular values, so the data matrix has no clear rank cutoff
    X = low_rank_fat_tail(n_samples, n_features, effective_rank=rank,
                          tail_strength=1.0, seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method without
    # the iterated power method
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.1

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
Example #7
def test_fast_svd_transpose_consistency():
    """Check that transposing the design matrix has limit impact"""
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = low_rank_fat_tail(n_samples,
                          n_features,
                          effective_rank=rank,
                          tail_strength=0.5,
                          seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    U1, s1, V1 = fast_svd(X, k, q=3, transpose=False, rng=0)
    U2, s2, V2 = fast_svd(X, k, q=3, transpose=True, rng=0)
    U3, s3, V3 = fast_svd(X, k, q=3, transpose='auto', rng=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1),
                        np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2),
                        np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose=True
    assert_almost_equal(s2, s3)
Пример #8
0
def test_fast_svd_low_rank_with_noise():
    """Check that extmath.fast_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and a
    # significant noisy component
    X = low_rank_fat_tail(n_samples,
                          n_features,
                          effective_rank=rank,
                          tail_strength=0.5,
                          seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method without
    # the iterated power method
    _, sa, _ = fast_svd(X, k, q=0)

    # the approximation does not tolerate the noise:
    assert np.abs(s[:k] - sa).max() > 0.05

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = fast_svd(X, k, q=5)

    # the iterated power method helps get rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
Example #9
def compute_bench(samples_range, features_range, q=3, rank=50):

    it = 0

    results = defaultdict(list)

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = low_rank_fat_tail(n_samples, n_features, effective_rank=rank,
                                  tail_strength=0.2)

            gc.collect()
            print("benching scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=0")
            tstart = time()
            fast_svd(X, rank, q=0)
            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=%d" % q)
            tstart = time()
            fast_svd(X, rank, q=q)
            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)

    return results
Example #10
def test_fast_svd_low_rank():
    """Check that extmath.fast_svd is consistent with linalg.svd"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = low_rank_fat_tail(n_samples, n_features, effective_rank=rank,
                          tail_strength=0.0, seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    U, s, V = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_equal(Ua.shape, (n_samples, k))
    assert_equal(sa.shape, (k,))
    assert_equal(Va.shape, (k, n_features))

    # ensure that the singular values of both methods are equal up to the real
    # rank of the matrix
    assert_almost_equal(s[:k], sa)

    # check the singular vectors too (while not checking the sign)
    assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va))

    # check the sparse matrix representation
    X = sparse.csr_matrix(X)

    # compute the singular values of X using the fast approximate method
    Ua, sa, Va = fast_svd(X, k)
    assert_almost_equal(s[:rank], sa[:rank])
Example #11
def test_fast_svd_transpose_consistency():
    """Check that transposing the design matrix has limit impact"""
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = low_rank_fat_tail(n_samples, n_features, effective_rank=rank,
                          tail_strength=0.5, seed=0)
    assert_equal(X.shape, (n_samples, n_features))

    U1, s1, V1 = fast_svd(X, k, q=3, transpose=False, rng=0)
    U2, s2, V2 = fast_svd(X, k, q=3, transpose=True, rng=0)
    U3, s3, V3 = fast_svd(X, k, q=3, transpose='auto', rng=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]),
                        decimal=2)

    # in this case 'auto' is equivalent to transpose=True
    assert_almost_equal(s2, s3)
Example #12
def fast_pseudoinverse(matrix, precision):
    """Approximate Moore-Penrose pseudoinverse via a truncated fast_svd.

    `precision` is the percentage of the smaller matrix dimension to keep
    as singular triplets.
    """
    if matrix.shape[0] <= matrix.shape[1]:
        # fat matrix: A = U S Vt, so pinv(A) = V S^-1 Ut
        val = int((precision * matrix.shape[0]) / 100)
        u, s, vt = slue.fast_svd(matrix, val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))

        temp_matrix = spmatrixmul(VT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, UT)
        del u, s, vt, UT, SI, VT, temp_matrix
    else:
        # tall matrix: factor the transpose; svd(A.T) = u s vt gives
        # pinv(A) = u S^-1 vt
        val = int((precision * matrix.transpose().shape[0]) / 100)
        u, s, vt = slue.fast_svd(matrix.transpose(), val)
        UT = ss.csr_matrix(np.nan_to_num(u.transpose()))
        SI = ss.csr_matrix(np.nan_to_num(np.diag(1 / s)))
        VT = ss.csr_matrix(np.nan_to_num(vt))

        temp_matrix = spmatrixmul(UT.transpose(), SI)
        pinv_matrix = spmatrixmul(temp_matrix, VT)
        del u, s, vt, UT, SI, VT, temp_matrix

    return pinv_matrix.tocsr()
Example #13
def compute_bench(samples_range, features_range, q=3, rank=50):

    it = 0

    results = defaultdict(list)

    max_it = len(samples_range) * len(features_range)
    for n_samples in samples_range:
        for n_features in features_range:
            it += 1
            print('====================')
            print('Iteration %03d of %03d' % (it, max_it))
            print('====================')
            X = low_rank_fat_tail(n_samples,
                                  n_features,
                                  effective_rank=rank,
                                  tail_strength=0.2)

            gc.collect()
            print("benching scipy svd: ")
            tstart = time()
            svd(X, full_matrices=False)
            results['scipy svd'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=0")
            tstart = time()
            fast_svd(X, rank, q=0)
            results['scikit-learn fast_svd (q=0)'].append(time() - tstart)

            gc.collect()
            print("benching scikit-learn fast_svd: q=%d" % q)
            tstart = time()
            fast_svd(X, rank, q=q)
            results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart)

    return results
Example #14
def pca(data, fast=False, output_dimension=None):
    """Perform PCA using SVD.

    data : MxN matrix of input data (M dimensions, N trials)

    Returns `v` (each column is a principal component, i.e. PC.T) and
    `V` (the variances, i.e. the squared singular values).
    """
    print("Performing PCA with an SVD-based algorithm")
    N, M = data.shape
    Y = data
    if fast is True and sklearn is True:
        if output_dimension is None:
            messages.warning_exit('When using fast_svd it is necessary to '
                                  'define the output_dimension')
        u, S, PC = fast_svd(Y, output_dimension, q=3)
    else:
        u, S, PC = scipy.linalg.svd(Y, full_matrices=False)

    v = PC.T
    V = S ** 2
    return v, V
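
A hypothetical call (the `sklearn` flag and `messages` helper are
module-level names in the snippet's own project; the data here is
illustrative):

import numpy as np

data = np.random.RandomState(0).randn(64, 1000)   # 64 dimensions, 1000 trials
v, V = pca(data, fast=True, output_dimension=10)
print(v.shape, V.shape)   # (1000, 10) and (10,)
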
Example #15
def svd(X):
    # `p` (the number of components to keep) is captured from the
    # enclosing scope
    return fast_svd(X, p, q=3)
Example #16
def svd(X):
    return fast_svd(X, p, q=3)
Example #17
def svd_pca(data, fast=False, output_dimension=None, centre=None,
            auto_transpose=True):
    """Perform PCA using SVD.
    
    Parameters
    ----------
    data : numpy array
        MxN array of input data (M variables, N trials)
    fast : bool
        Whether to use randomized SVD to estimate only the leading
        `output_dimension` components
    output_dimension : int
        Number of components to estimate when fast is True
    centre : None | 'variables' | 'trials'
        If None, no centring is applied. If 'variables', centring is
        performed along the variables axis; if 'trials', along the
        trials axis.
    auto_transpose : bool
        If True, automatically transposes the data to boost performance
    
    Returns
    -------
    
    factors : numpy array
    loadings : numpy array
    explained_variance : numpy array
    mean : numpy array or None (if centre is None)
    """
    N, M = data.shape
    if centre is not None:
        if centre == 'variables':
            mean = data.mean(1)[:,np.newaxis]
        elif centre == 'trials':
            mean = data.mean(0)[np.newaxis,:]
        else:
            raise AttributeError(
                'centre must be one of: None, variables, trials')
        data -= mean
    else:
        mean = None 
    if auto_transpose is True:
        if N < M:
            print("Auto transposing the data")
            data = data.T
        else:
            auto_transpose = False
    if fast is True and sklearn is True:
        if output_dimension is None:
            messages.warning_exit('When using fast_svd it is necessary to '
                                  'define the output_dimension')
        U, S, V = fast_svd(data, output_dimension)
    else:
        U, S, V = scipy.linalg.svd(data, full_matrices=False)
    if auto_transpose is False:
        factors = V.T
        explained_variance = S ** 2 / N
        loadings = U * S
    else:
        loadings = V.T
        explained_variance = S ** 2 / N
        factors = U * S
    return factors, loadings, explained_variance, mean
Example #18
        X[i, j] = 1.0
    del links
    print("Converting to CSR representation")
    X = X.tocsr()
    print("CSR conversion done")
    return X, redirects, index_map


# stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(
    redirects_filename, page_links_filename, limit=5000000)
names = dict((i, name) for name, i in index_map.items())

print("Computing the principal singular vectors using fast_svd")
t0 = time()
U, s, V = fast_svd(X, 5, q=3)
print("done in %0.3fs" % (time() - t0))

# print the names of the strongest Wikipedia components of the principal
# singular vectors, which should be similar to the principal eigenvector
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])


def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank and the implementation
    is based on the one from the NetworkX project (BSD licensed too)
    with copyrights by:
Example #19
def svd(X):
    return fast_svd(X, p)
Example #20
    del links
    print("Converting to CSR representation")
    X = X.tocsr()
    print("CSR conversion done")
    return X, redirects, index_map


# stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(redirects_filename,
                                               page_links_filename,
                                               limit=5000000)
names = dict((i, name) for name, i in index_map.items())

print("Computing the principal singular vectors using fast_svd")
t0 = time()
U, s, V = fast_svd(X, 5, q=3)
print("done in %0.3fs" % (time() - t0))

# print the names of the strongest Wikipedia components of the principal
# singular vectors, which should be similar to the principal eigenvector
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])


def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank and the implementation
    is based on the one from the NetworkX project (BSD licensed too)
    with copyrights by:
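
The snippet is cut off above. For orientation only, a generic damped
power-iteration loop of the kind the docstring describes might look like
this sketch (a simplified stand-in, not the NetworkX or original
implementation; dangling-node mass is dropped rather than redistributed):

import numpy as np
from scipy.sparse import csr_matrix

def centrality_scores_sketch(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Damped power iteration on the row-normalized adjacency matrix."""
    n = X.shape[0]
    X = csr_matrix(X, dtype=np.float64)
    out_degree = np.asarray(X.sum(axis=1)).ravel()
    out_degree[out_degree == 0.0] = 1.0      # avoid division by zero
    inv_degree = 1.0 / out_degree

    scores = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        previous = scores
        # follow a link with probability alpha, teleport otherwise
        scores = alpha * (X.T @ (inv_degree * previous)) + (1.0 - alpha) / n
        if np.abs(scores - previous).sum() < tol:
            break
    return scores
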