def test_randomized_svd_power_iteration_normalizer(): # randomized_svd with power_iteration_normalized='none' diverges for # large number of power iterations on this dataset rng = np.random.RandomState(42) X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng) X += 3 * rng.randint(0, 2, size=X.shape) n_components = 50 # Check that it diverges with many (non-normalized) power iterations U, s, V = randomized_svd(X, n_components, n_iter=2, power_iteration_normalizer='none') A = X - U.dot(np.diag(s).dot(V)) error_2 = linalg.norm(A, ord='fro') U, s, V = randomized_svd(X, n_components, n_iter=20, power_iteration_normalizer='none') A = X - U.dot(np.diag(s).dot(V)) error_20 = linalg.norm(A, ord='fro') assert_greater(np.abs(error_2 - error_20), 100) for normalizer in ['LU', 'QR', 'auto']: U, s, V = randomized_svd(X, n_components, n_iter=2, power_iteration_normalizer=normalizer, random_state=0) A = X - U.dot(np.diag(s).dot(V)) error_2 = linalg.norm(A, ord='fro') for i in [5, 10, 50]: U, s, V = randomized_svd(X, n_components, n_iter=i, power_iteration_normalizer=normalizer, random_state=0) A = X - U.dot(np.diag(s).dot(V)) error = linalg.norm(A, ord='fro') assert_greater(15, np.abs(error_2 - error))
def bench_b(power_list): n_samples, n_features = 1000, 10000 data_params = {'n_samples': n_samples, 'n_features': n_features, 'tail_strength': .7, 'random_state': random_state} dataset_name = "low rank matrix %d x %d" % (n_samples, n_features) ranks = [10, 50, 100] if enable_spectral_norm: all_spectral = defaultdict(list) all_frobenius = defaultdict(list) for rank in ranks: X = make_low_rank_matrix(effective_rank=rank, **data_params) if enable_spectral_norm: X_spectral_norm = norm_diff(X, norm=2, msg=False) X_fro_norm = norm_diff(X, norm='fro', msg=False) for n_comp in [np.int(rank/2), rank, rank*2]: label = "rank=%d, n_comp=%d" % (rank, n_comp) print(label) for pi in power_list: U, s, V, _ = svd_timing(X, n_comp, n_iter=pi, n_oversamples=2, power_iteration_normalizer='LU') if enable_spectral_norm: A = U.dot(np.diag(s).dot(V)) all_spectral[label].append(norm_diff(X - A, norm=2) / X_spectral_norm) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if enable_spectral_norm: title = "%s: spectral norm diff vs n power iteration" % (dataset_name) plot_power_iter_vs_s(power_iter, all_spectral, title) title = "%s: frobenius norm diff vs n power iteration" % (dataset_name) plot_power_iter_vs_s(power_iter, all_frobenius, title)
def tableresults(): samples_range = np.array([20000]) features_range = np.array([10000]) iter_range = np.arange(0, 6, 1).astype(np.int) it = 0 results = [] max_it = len(samples_range) * len(features_range)*len(iter_range) for n_samples in samples_range: for n_features in features_range: for iter in iter_range: it += 1 print('====================') print('Iteration %03d of %03d' % (it, max_it)) print('====================') X = make_low_rank_matrix(n_samples, n_features, effective_rank=500, tail_strength=0.2) s1,v1,d1 = svd(X, full_matrices=False) value1 = v1[0] print("benchmarking randomized_svd: n_iter= %d" %iter) s2,v2,d2=randomized_svd(X, 1000, n_iter=iter) value2 = v2[0] results.append(value2/value1) gc.collect() return results
def test_fast_svd_low_rank(): """Check that extmath.fast_svd is consistent with linalg.svd""" n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X of approximate effective rank `rank` and no noise # component (very structured signal): X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.0, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method U, s, V = linalg.svd(X, full_matrices=False) # compute the singular values of X using the fast approximate method Ua, sa, Va = fast_svd(X, k) assert_equal(Ua.shape, (n_samples, k)) assert_equal(sa.shape, (k,)) assert_equal(Va.shape, (k, n_features)) # ensure that the singular values of both methods are equal up to the real # rank of the matrix assert_almost_equal(s[:k], sa) # check the singular vectors too (while not checking the sign) assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va)) # check the sparse matrix representation X = sparse.csr_matrix(X) # compute the singular values of X using the fast approximate method Ua, sa, Va = fast_svd(X, k) assert_almost_equal(s[:rank], sa[:rank])
def test_randomized_svd_transpose_consistency(): # Check that transposing the design matrix has limited impact n_samples = 100 n_features = 500 rank = 4 k = 10 X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert_equal(X.shape, (n_samples, n_features)) U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0) U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0) U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto', random_state=0) U4, s4, V4 = linalg.svd(X, full_matrices=False) assert_almost_equal(s1, s4[:k], decimal=3) assert_almost_equal(s2, s4[:k], decimal=3) assert_almost_equal(s3, s4[:k], decimal=3) assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]), decimal=2) assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]), decimal=2) # in this case 'auto' is equivalent to transpose assert_almost_equal(s2, s3)
def tableresults(): samples_range = np.array([1000000]) features_range = np.arange(100, 2100, 400) iter_range = np.array([3]) it = 0 results = [] max_it = len(samples_range) * len(features_range) * len(iter_range) for n_samples in samples_range: for n_features in features_range: for iter in iter_range: it += 1 print('====================') print('Iteration %03d of %03d' % (it, max_it)) print('====================') X = make_low_rank_matrix(n_samples, n_features, effective_rank=500, tail_strength=0.2) gc.collect() print("benchmarking randomized_svd: n_iter= %d" % iter) tstart = time() randomized_svd(X, 500, n_iter=iter) results.append(time() - tstart) gc.collect() return results
def benchmark(samples_range, features_range, rank=50, tolerance=1e-5): timeset = defaultdict(lambda: []) err = defaultdict(lambda: []) for n_samples in samples_range: for n_features in features_range: print("%2d samples, %2d features" % (n_samples, n_features)) print('=======================') X = np.abs(make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2)) gc.collect() print("benchmarking nndsvd-nmf: ") tstart = time() m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X) tend = time() - tstart timeset['nndsvd-nmf'].append(tend) err['nndsvd-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking nndsvda-nmf: ") tstart = time() m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvda-nmf'].append(tend) err['nndsvda-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking nndsvdar-nmf: ") tstart = time() m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvdar-nmf'].append(tend) err['nndsvdar-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking random-nmf") tstart = time() m = NMF(n_components=30, init='random', max_iter=1000, tol=tolerance).fit(X) tend = time() - tstart timeset['random-nmf'].append(tend) err['random-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking alt-random-nmf") tstart = time() W, H = alt_nnmf(X, r=30, init='random', tol=tolerance) tend = time() - tstart timeset['alt-random-nmf'].append(tend) err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H))) report(norm(X - np.dot(W, H)), tend) return timeset, err
def test_randomized_svd_transpose_consistency(): """Check that transposing the design matrix has limit impact""" n_samples = 100 n_features = 500 rank = 4 k = 10 X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert_equal(X.shape, (n_samples, n_features)) U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0) U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0) U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto', random_state=0) U4, s4, V4 = linalg.svd(X, full_matrices=False) assert_almost_equal(s1, s4[:k], decimal=3) assert_almost_equal(s2, s4[:k], decimal=3) assert_almost_equal(s3, s4[:k], decimal=3) assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]), decimal=2) assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]), decimal=2) # in this case 'auto' is equivalent to transpose assert_almost_equal(s2, s3)
def test_randomized_svd_low_rank_with_noise(): # Check that extmath.randomized_svd can handle noisy matrices n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X wity structure approximate rank `rank` and an # important noisy component X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.1, random_state=0) assert X.shape == (n_samples, n_features) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) for normalizer in ['auto', 'none', 'LU', 'QR']: # compute the singular values of X using the fast approximate # method without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0, power_iteration_normalizer=normalizer, random_state=0) # the approximation does not tolerate the noise: assert np.abs(s[:k] - sa).max() > 0.01 # compute the singular values of X using the fast approximate # method with iterated power method _, sap, _ = randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) # the iterated power method is helping getting rid of the noise: assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_low_rank_with_noise(): """Check that extmath.randomized_svd can handle noisy matrices""" n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X wity structure approximate rank `rank` and an # important noisy component X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.05) # compute the singular values of X using the fast approximate method with # iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5) # the iterated power method is helping getting rid of the noise: assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_infinite_rank(): """Check that extmath.randomized_svd can handle noisy matrices""" n_samples = 100 n_features = 500 rank = 5 k = 10 # let us try again without 'low_rank component': just regularly but slowly # decreasing singular values: the rank of the data matrix is infinite X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=1.0, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.1) # compute the singular values of X using the fast approximate method with # iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5) # the iterated power method is still managing to get most of the structure # at the requested rank assert_almost_equal(s[:k], sap, decimal=3)
def compute_bench(samples_range, features_range, q=3, rank=50): it = 0 results = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print '====================' print 'Iteration %03d of %03d' % (it, max_it) print '====================' X = make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2) gc.collect() print "benching scipy svd: " tstart = time() svd(X, full_matrices=False) results['scipy svd'].append(time() - tstart) gc.collect() print "benching scikit-learn fast_svd: q=0" tstart = time() fast_svd(X, rank, q=0) results['scikit-learn fast_svd (q=0)'].append(time() - tstart) gc.collect() print "benching scikit-learn fast_svd: q=%d " % q tstart = time() fast_svd(X, rank, q=q) results['scikit-learn fast_svd (q=%d)' % q].append(time() - tstart) return results
def timing(X,rank, n_oversamples=10, n_iter=3): X = make_low_rank_matrix(1000,1000,effective_rank=1000,tail_strength=0.2) gc.collect() tstart = time() randomized_svd(X, rank, n_iter=n_iter) results = float(time() - tstart) print("benchmarking randomized_svd: time=%f " % results)
def get_data(dataset_name): print("Getting dataset: %s" % dataset_name) if dataset_name == 'lfw_people': X = fetch_lfw_people().data elif dataset_name == '20newsgroups': X = fetch_20newsgroups_vectorized().data[:, :100000] elif dataset_name == 'olivetti_faces': X = fetch_olivetti_faces().data elif dataset_name == 'rcv1': X = fetch_rcv1().data elif dataset_name == 'CIFAR': if handle_missing_dataset(CIFAR_FOLDER) == "skip": return X1 = [ unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1)) for i in range(5) ] X = np.vstack(X1) del X1 elif dataset_name == 'SVHN': if handle_missing_dataset(SVHN_FOLDER) == 0: return X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X'] X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])] X = np.vstack(X2) del X1 del X2 elif dataset_name == 'low rank matrix': X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4), effective_rank=100, tail_strength=.5, random_state=random_state) elif dataset_name == 'uncorrelated matrix': X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000, random_state=random_state) elif dataset_name == 'big sparse matrix': sparsity = np.int(1e6) size = np.int(1e6) small_size = np.int(1e4) data = np.random.normal(0, 1, np.int(sparsity / 10)) data = np.repeat(data, 10) row = np.random.uniform(0, small_size, sparsity) col = np.random.uniform(0, small_size, sparsity) X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size)) del data del row del col else: X = fetch_mldata(dataset_name).data return X
def test_randomized_svd_sparse_warnings(): # randomized_svd throws a warning for lil and dok matrix rng = np.random.RandomState(42) X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng) n_components = 5 for cls in (sparse.lil_matrix, sparse.dok_matrix): X = cls(X) assert_warns_message( sparse.SparseEfficiencyWarning, "Calculating SVD of a {} is expensive. " "csr_matrix is more efficient.".format(cls.__name__), randomized_svd, X, n_components, n_iter=1, power_iteration_normalizer='none')
def compute_bench(samples_range, features_range, iter_range=[3], rank_range=[50]): it = 0 results = [] results1 = [] results2 = [] max_it = len(samples_range) * len(features_range) * len(rank_range) * len( iter_range) for n_samples in samples_range: for n_features in features_range: for rank in rank_range: for n_iter in iter_range: it += 1 print('====================') print('Iteration %03d of %03d' % (it, max_it)) print('====================') X = make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2) value = norm(X) gc.collect() print("benchmarking scipy svd: ") tstart = time() s1, v1, d1 = svd(X, full_matrices=False) value1 = v1[0] results.append(value1 / value1) gc.collect() print("benchmarking randomized_svd: n_iter=1") tstart = time() s2, v2, d2 = randomized_svd(X, rank, n_iter=1) value2 = v2[0] results1.append(value2 / value1) gc.collect() print("benchmarking randomized_svd: n_iter=%d " % iter_range[0]) tstart = time() s3, v3, d3 = randomized_svd(X, rank, n_iter=n_iter) value3 = v3[0] results2.append(value3 / value1) return results, results1, results2
def get_data(dataset_name): print("Getting dataset: %s" % dataset_name) if dataset_name == 'lfw_people': X = fetch_lfw_people().data elif dataset_name == '20newsgroups': X = fetch_20newsgroups_vectorized().data[:, :100000] elif dataset_name == 'olivetti_faces': X = fetch_olivetti_faces().data elif dataset_name == 'rcv1': X = fetch_rcv1().data elif dataset_name == 'CIFAR': if handle_missing_dataset(CIFAR_FOLDER) == "skip": return X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1)) for i in range(5)] X = np.vstack(X1) del X1 elif dataset_name == 'SVHN': if handle_missing_dataset(SVHN_FOLDER) == 0: return X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X'] X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])] X = np.vstack(X2) del X1 del X2 elif dataset_name == 'low rank matrix': X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4), effective_rank=100, tail_strength=.5, random_state=random_state) elif dataset_name == 'uncorrelated matrix': X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000, random_state=random_state) elif dataset_name == 'big sparse matrix': sparsity = np.int(1e6) size = np.int(1e6) small_size = np.int(1e4) data = np.random.normal(0, 1, np.int(sparsity/10)) data = np.repeat(data, 10) row = np.random.uniform(0, small_size, sparsity) col = np.random.uniform(0, small_size, sparsity) X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size)) del data del row del col else: X = fetch_mldata(dataset_name).data return X
def compute_bench(samples_range, features_range, iter_range=[3], rank_range=[50]): it = 0 results = [] results1 = [] results2 = [] max_it = len(samples_range) * len(features_range) * len(rank_range) * len( iter_range) for n_samples in samples_range: for n_features in features_range: for rank in rank_range: for n_iter in iter_range: it += 1 print('====================') print('Iteration %03d of %03d' % (it, max_it)) print('====================') X = make_low_rank_matrix(n_samples, n_features, effective_rank=50, tail_strength=0.2) gc.collect() print("benchmarking scipy svd: ") tstart = time() svd(X, full_matrices=False) results.append(time() - tstart) gc.collect() print("benchmarking randomized_svd: n_iter=1") tstart = time() randomized_svd(X, rank, n_iter=1) results1.append(time() - tstart) gc.collect() print("benchmarking randomized_svd: n_iter=%d " % iter_range[0]) tstart = time() randomized_svd(X, rank, n_iter=n_iter) results2.append(time() - tstart) return results, results1, results2
def compute_bench(samples_range, features_range, n_iterations=3, rank=50): it = 0 results = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print '====================' print 'Iteration %03d of %03d' % (it, max_it) print '====================' X = make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2) gc.collect() print "benching scipy svd: " tstart = time() svd(X, full_matrices=False) results['scipy svd'].append(time() - tstart) gc.collect() print "benching scikit-learn randomized_svd: n_iterations=0" tstart = time() randomized_svd(X, rank, n_iterations=0) results['scikit-learn randomized_svd (n_iterations=0)'].append( time() - tstart) gc.collect() print("benching scikit-learn randomized_svd: n_iterations=%d " % n_iterations) tstart = time() randomized_svd(X, rank, n_iterations=n_iterations) results['scikit-learn randomized_svd (n_iterations=%d)' % n_iterations].append(time() - tstart) return results
def compute_bench(samples_range, features_range, n_iter=3, rank=50): it = 0 results = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print('====================') print('Iteration %03d of %03d' % (it, max_it)) print('====================') X = make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2) gc.collect() print("benchmarking scipy svd: ") tstart = time() svd(X, full_matrices=False) results['scipy svd'].append(time() - tstart) gc.collect() print("benchmarking scikit-learn randomized_svd: n_iter=0") tstart = time() randomized_svd(X, rank, n_iter=0) results['scikit-learn randomized_svd (n_iter=0)'].append( time() - tstart) gc.collect() print("benchmarking scikit-learn randomized_svd: n_iter=%d " % n_iter) tstart = time() randomized_svd(X, rank, n_iter=n_iter) results['scikit-learn randomized_svd (n_iter=%d)' % n_iter].append(time() - tstart) return results
def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7): it = 0 timeset = defaultdict(lambda: []) err = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print '====================' print 'Iteration %03d of %03d' % (it, max_it) print '====================' X = np.abs( make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2)) gc.collect() print "benching nndsvd-nmf: " tstart = time() m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X) tend = time() - tstart timeset['nndsvd-nmf'].append(tend) err['nndsvd-nmf'].append(m.reconstruction_err_) print m.reconstruction_err_, tend gc.collect() print "benching nndsvda-nmf: " tstart = time() m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvda-nmf'].append(tend) err['nndsvda-nmf'].append(m.reconstruction_err_) print m.reconstruction_err_, tend gc.collect() print "benching nndsvdar-nmf: " tstart = time() m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvdar-nmf'].append(tend) err['nndsvdar-nmf'].append(m.reconstruction_err_) print m.reconstruction_err_, tend gc.collect() print "benching random-nmf" tstart = time() m = NMF(n_components=30, init=None, max_iter=1000, tol=tolerance).fit(X) tend = time() - tstart timeset['random-nmf'].append(tend) err['random-nmf'].append(m.reconstruction_err_) print m.reconstruction_err_, tend gc.collect() print "benching alt-random-nmf" tstart = time() W, H = alt_nnmf(X, r=30, R=None, tol=tolerance) tend = time() - tstart timeset['alt-random-nmf'].append(tend) err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H))) print np.linalg.norm(X - np.dot(W, H)), tend return timeset, err
def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7): it = 0 timeset = defaultdict(lambda: []) err = defaultdict(lambda: []) max_it = len(samples_range) * len(features_range) for n_samples in samples_range: for n_features in features_range: it += 1 print('====================') print('Iteration %03d of %03d' % (it, max_it)) print('====================') X = np.abs(make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2)) gc.collect() print("benching nndsvd-nmf: ") tstart = time() m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X) tend = time() - tstart timeset['nndsvd-nmf'].append(tend) err['nndsvd-nmf'].append(m.reconstruction_err_) print(m.reconstruction_err_, tend) gc.collect() print("benching nndsvda-nmf: ") tstart = time() m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvda-nmf'].append(tend) err['nndsvda-nmf'].append(m.reconstruction_err_) print(m.reconstruction_err_, tend) gc.collect() print("benching nndsvdar-nmf: ") tstart = time() m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvdar-nmf'].append(tend) err['nndsvdar-nmf'].append(m.reconstruction_err_) print(m.reconstruction_err_, tend) gc.collect() print("benching random-nmf") tstart = time() m = NMF(n_components=30, init=None, max_iter=1000, tol=tolerance).fit(X) tend = time() - tstart timeset['random-nmf'].append(tend) err['random-nmf'].append(m.reconstruction_err_) print(m.reconstruction_err_, tend) gc.collect() print("benching alt-random-nmf") tstart = time() W, H = alt_nnmf(X, r=30, R=None, tol=tolerance) tend = time() - tstart timeset['alt-random-nmf'].append(tend) err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H))) print(np.linalg.norm(X - np.dot(W, H)), tend) return timeset, err
def benchmark(samples_range, features_range, rank=50, tolerance=1e-5): timeset = defaultdict(lambda: []) err = defaultdict(lambda: []) for n_samples in samples_range: for n_features in features_range: print("%2d samples, %2d features" % (n_samples, n_features)) print('=======================') X = np.abs( make_low_rank_matrix(n_samples, n_features, effective_rank=rank, tail_strength=0.2)) gc.collect() print("benchmarking nndsvd-nmf: ") tstart = time() m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X) tend = time() - tstart timeset['nndsvd-nmf'].append(tend) err['nndsvd-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking nndsvda-nmf: ") tstart = time() m = NMF(n_components=30, init='nndsvda', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvda-nmf'].append(tend) err['nndsvda-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking nndsvdar-nmf: ") tstart = time() m = NMF(n_components=30, init='nndsvdar', tol=tolerance).fit(X) tend = time() - tstart timeset['nndsvdar-nmf'].append(tend) err['nndsvdar-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking random-nmf") tstart = time() m = NMF(n_components=30, init='random', max_iter=1000, tol=tolerance).fit(X) tend = time() - tstart timeset['random-nmf'].append(tend) err['random-nmf'].append(m.reconstruction_err_) report(m.reconstruction_err_, tend) gc.collect() print("benchmarking alt-random-nmf") tstart = time() W, H = alt_nnmf(X, r=30, init='random', tol=tolerance) tend = time() - tstart timeset['alt-random-nmf'].append(tend) err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H))) report(norm(X - np.dot(W, H)), tend) return timeset, err
def check_randomized_svd_low_rank(dtype): # Check that extmath.randomized_svd is consistent with linalg.svd n_samples = 100 n_features = 500 rank = 5 k = 10 decimal = 5 if dtype == np.float32 else 7 dtype = np.dtype(dtype) # generate a matrix X of approximate effective rank `rank` and no noise # component (very structured signal): X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.0, random_state=0).astype(dtype, copy=False) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method U, s, V = linalg.svd(X, full_matrices=False) # Convert the singular values to the specific dtype U = U.astype(dtype, copy=False) s = s.astype(dtype, copy=False) V = V.astype(dtype, copy=False) for normalizer in ['auto', 'LU', 'QR']: # 'none' would not be stable # compute the singular values of X using the fast approximate method Ua, sa, Va = randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) # If the input dtype is float, then the output dtype is float of the # same bit size (f32 is not upcast to f64) # But if the input dtype is int, the output dtype is float64 if dtype.kind == 'f': assert Ua.dtype == dtype assert sa.dtype == dtype assert Va.dtype == dtype else: assert Ua.dtype == np.float64 assert sa.dtype == np.float64 assert Va.dtype == np.float64 assert_equal(Ua.shape, (n_samples, k)) assert_equal(sa.shape, (k, )) assert_equal(Va.shape, (k, n_features)) # ensure that the singular values of both methods are equal up to the # real rank of the matrix assert_almost_equal(s[:k], sa, decimal=decimal) # check the singular vectors too (while not checking the sign) assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va), decimal=decimal) # check the sparse matrix representation X = sparse.csr_matrix(X) # compute the singular values of X using the fast approximate method Ua, sa, Va = \ randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) if dtype.kind == 'f': assert Ua.dtype == dtype assert sa.dtype == dtype assert Va.dtype == dtype else: assert Ua.dtype.kind == 'f' assert sa.dtype.kind == 'f' assert Va.dtype.kind == 'f' assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
def check_randomized_svd_low_rank(dtype): # Check that extmath.randomized_svd is consistent with linalg.svd n_samples = 100 n_features = 500 rank = 5 k = 10 decimal = 5 if dtype == np.float32 else 7 dtype = np.dtype(dtype) # generate a matrix X of approximate effective rank `rank` and no noise # component (very structured signal): X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.0, random_state=0).astype(dtype, copy=False) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method U, s, V = linalg.svd(X, full_matrices=False) # Convert the singular values to the specific dtype U = U.astype(dtype, copy=False) s = s.astype(dtype, copy=False) V = V.astype(dtype, copy=False) for normalizer in ['auto', 'LU', 'QR']: # 'none' would not be stable # compute the singular values of X using the fast approximate method Ua, sa, Va = randomized_svd( X, k, power_iteration_normalizer=normalizer, random_state=0) # If the input dtype is float, then the output dtype is float of the # same bit size (f32 is not upcast to f64) # But if the input dtype is int, the output dtype is float64 if dtype.kind == 'f': assert Ua.dtype == dtype assert sa.dtype == dtype assert Va.dtype == dtype else: assert Ua.dtype == np.float64 assert sa.dtype == np.float64 assert Va.dtype == np.float64 assert_equal(Ua.shape, (n_samples, k)) assert_equal(sa.shape, (k,)) assert_equal(Va.shape, (k, n_features)) # ensure that the singular values of both methods are equal up to the # real rank of the matrix assert_almost_equal(s[:k], sa, decimal=decimal) # check the singular vectors too (while not checking the sign) assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va), decimal=decimal) # check the sparse matrix representation X = sparse.csr_matrix(X) # compute the singular values of X using the fast approximate method Ua, sa, Va = \ randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) if dtype.kind == 'f': assert Ua.dtype == dtype assert sa.dtype == dtype assert Va.dtype == dtype else: assert Ua.dtype.kind == 'f' assert sa.dtype.kind == 'f' assert Va.dtype.kind == 'f' assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)