Пример #1
0
    def test_block_rdd_array_block_size(self):
        """Blocks never contain more rows than the requested ``block_size``."""
        num_slices = 10
        num_rows = 107
        rows = [np.array([1]) for _ in range(num_rows)]
        rdd = self.sc.parallelize(rows, num_slices)

        # 107 is a multiple of neither limit, so some block must be partial.
        for limit in (5, 10):
            collected = block_rdd(rdd, block_size=limit).collect()
            assert_true(all(len(chunk) <= limit for chunk in collected))
Пример #2
0
    def test_block_rdd_array_block_size(self):
        """Check that ``block_rdd`` treats ``block_size`` as an upper bound."""
        partition_count, sample_count = 10, 107
        samples = self.sc.parallelize(
            [np.array([1]) for _ in range(sample_count)], partition_count)

        # Same check for two different block sizes, smallest first.
        for max_rows in (5, 10):
            chunks = block_rdd(samples, block_size=max_rows).collect()
            within_limit = (len(chunk) <= max_rows for chunk in chunks)
            assert_true(all(within_limit))
Пример #3
0
 def test_cov(self):
     """Distributed ``cov`` agrees with ``np.cov`` to one decimal place."""
     generator = np.random.RandomState(42)
     sigma = np.array([[3., 2., 4.], [2., 2., 5.], [4., 5., 6.]])
     mean = np.array([1., 2., 3.])
     samples = generator.multivariate_normal(mean, cov=sigma, size=int(1e3))

     # Spread the sample rows over 4 partitions before blocking them.
     blocked = block_rdd(self.sc.parallelize(samples, 4))
     estimate = cov(blocked)
     assert_array_almost_equal(np.cov(samples.T), estimate, decimal=1)
Пример #4
0
 def test_cov(self):
     """Compare ``cov`` on a blocked RDD against NumPy's covariance."""
     random_state = np.random.RandomState(42)
     covariance = np.array([[3., 2., 4.], [2., 2., 5.], [4., 5., 6.]])
     draws = random_state.multivariate_normal(
         np.array([1., 2., 3.]), size=int(1e3), cov=covariance)

     rdd = self.sc.parallelize(draws, 4)
     distributed_cov = cov(block_rdd(rdd))

     # Only one decimal is checked.
     assert_array_almost_equal(np.cov(draws.T), distributed_cov, decimal=1)
Пример #5
0
 def test_svd(self):
     """Rank-1 distributed ``svd`` matches the leading triplet of ``ln.svd``.

     The estimates are compared against the first singular value and the
     first left/right singular vectors of the dense decomposition.
     ``match_sign`` is defined elsewhere; presumably it aligns signs, since
     singular vectors are only defined up to sign -- TODO confirm.
     """
     rng = np.random.RandomState(42)
     # randn needs integer dimensions: the float literal 1e3 raises
     # TypeError on current NumPy, so convert it explicitly.
     mat = rng.randn(int(1e3), 10)
     data = block_rdd(self.sc.parallelize(list(mat), 10))
     u, s, v = svd(data, 1)
     # Stitch the per-block pieces of u back into one flat vector.
     u = np.squeeze(np.concatenate(np.array(u.collect()))).T
     u_true, s_true, v_true = ln.svd(mat)
     assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :]))
     assert_array_almost_equal(s[0], s_true[0])
     assert_array_almost_equal(u, match_sign(u, u_true[:, 0]))
Пример #6
0
 def test_svd(self):
     """Check the rank-1 distributed ``svd`` against a dense ``ln.svd``.

     Only the leading singular value and its left/right singular vectors
     are compared. ``match_sign`` comes from elsewhere in the project and
     presumably resolves the sign ambiguity -- TODO confirm.
     """
     rng = np.random.RandomState(42)
     # The row count must be an int; passing the float 1e3 to randn raises
     # TypeError on current NumPy.
     n_rows = int(1e3)
     mat = rng.randn(n_rows, 10)
     data = block_rdd(self.sc.parallelize(list(mat), 10))
     u, s, v = svd(data, 1)
     # Concatenate the collected blocks of u and flatten to a vector.
     u = np.squeeze(np.concatenate(np.array(u.collect()))).T
     u_true, s_true, v_true = ln.svd(mat)
     assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :]))
     assert_array_almost_equal(s[0], s_true[0])
     assert_array_almost_equal(u, match_sign(u, u_true[:, 0]))
Пример #7
0
    def test_block_rdd_dict(self):
        """Dict records are blocked with their keys exposed as attributes.

        Each block must respect ``block_size`` and the first block must
        hold the first five records, readable as ``.a`` and ``.b``.
        """
        n_partitions = 3
        n_samples = 57
        dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)]
        data = self.sc.parallelize(dicts, n_partitions)

        block_data_5 = block_rdd(data, block_size=5)
        blocks = block_data_5.collect()
        assert_true(all(len(b) <= 5 for b in blocks))
        assert_array_almost_equal(blocks[0].a, np.arange(5))
        # The np.float alias was removed in NumPy 1.24; builtin float is
        # the same dtype (float64).
        assert_array_almost_equal(blocks[0].b,
                                  np.arange(5, dtype=float) ** 2)
Пример #8
0
 def test_svd_em(self):
     """Rank-1 ``svd_em`` roughly agrees with ``ln.svd`` on a 10x3 matrix."""
     generator = np.random.RandomState(42)
     dense = generator.randn(10, 3)
     blocked = block_rdd(self.sc.parallelize(list(dense), 2)).cache()

     left, sigma, right = svd_em(blocked, 1, seed=42)
     # Merge the collected blocks of the left factor into one flat vector.
     left = np.squeeze(np.concatenate(np.array(left.collect()))).T
     ref_left, ref_sigma, ref_right = ln.svd(dense)

     # Passed positionally as the third argument of every comparison.
     precision = 1
     assert_array_almost_equal(
         right[0], match_sign(right[0], ref_right[0, :]), precision)
     assert_array_almost_equal(sigma[0], ref_sigma[0], precision)
     assert_array_almost_equal(
         left, match_sign(left, ref_left[:, 0]), precision)
Пример #9
0
 def test_svd_em(self):
     """Compare the leading ``svd_em`` triplet with a dense ``ln.svd``."""
     random_state = np.random.RandomState(42)
     matrix = random_state.randn(10, 3)
     rdd = self.sc.parallelize(list(matrix), 2)
     cached_blocks = block_rdd(rdd).cache()

     u_est, s_est, v_est = svd_em(cached_blocks, 1, seed=42)
     u_parts = np.array(u_est.collect())
     u_est = np.squeeze(np.concatenate(u_parts)).T

     u_ref, s_ref, v_ref = ln.svd(matrix)

     # The same loose setting is forwarded to all three checks.
     loose = 1
     v_expected = match_sign(v_est[0], v_ref[0, :])
     assert_array_almost_equal(v_est[0], v_expected, loose)
     assert_array_almost_equal(s_est[0], s_ref[0], loose)
     u_expected = match_sign(u_est, u_ref[:, 0])
     assert_array_almost_equal(u_est, u_expected, loose)
Пример #10
0
    def test_block_rdd_array(self):
        """Without ``block_size``, ``block_rdd`` yields one block per partition.

        Checked twice: with 100 samples split evenly over 10 partitions,
        and with 17 partitions, which do not divide 100 evenly.
        """
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block_rdd(data)
        assert_array_almost_equal(np.ones((10, 1)), blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_almost_equal(np.ones((10, 1)), blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block_rdd(data)
        # Floor division keeps the shape an int: on Python 3, `/` gives a
        # float and np.ones rejects it.
        assert_array_almost_equal(np.ones((n_samples // n_partitions, 1)),
                                  blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Пример #11
0
    def test_block_rdd_sp_matrix(self):
        """Sparse rows are stacked into sparse blocks by ``block_rdd``."""
        num_slices = 10
        num_rows = 100
        row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        rdd = self.sc.parallelize([row] * num_rows, num_slices)

        blocked = block_rdd(rdd)
        assert_true(sp.issparse(blocked.first()))

        # The first block is expected to be ten copies of the row stacked.
        reference = sp.vstack([row for _ in range(10)])
        assert_array_almost_equal(reference.toarray(),
                                  blocked.first().toarray())
Пример #12
0
    def test_block_rdd_sp_matrix(self):
        """Blocking an RDD of CSR rows yields vertically stacked sparse blocks."""
        partition_count, sample_count = 10, 100
        csr_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        samples = [csr_row] * sample_count
        blocks = block_rdd(self.sc.parallelize(samples, partition_count))

        # The block must stay sparse rather than being densified.
        assert_true(sp.issparse(blocks.first()))

        stacked = sp.vstack([csr_row] * 10).toarray()
        assert_array_almost_equal(stacked, blocks.first().toarray())
Пример #13
0
    def test_block_rdd_dict(self):
        """Blocks built from dict records expose each key as an attribute.

        Verifies the ``block_size`` bound and that the first block holds
        records 0..4 under ``.a`` and their squares under ``.b``.
        """
        n_partitions = 3
        n_samples = 57
        dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)]
        data = self.sc.parallelize(dicts, n_partitions)

        block_data_5 = block_rdd(data, block_size=5)
        blocks = block_data_5.collect()
        assert_true(all(len(b) <= 5 for b in blocks))
        assert_array_almost_equal(blocks[0].a, np.arange(5))
        # Builtin float replaces the np.float alias, which NumPy 1.24
        # removed; both mean float64.
        expected_b = np.arange(5, dtype=float) ** 2
        assert_array_almost_equal(blocks[0].b, expected_b)
Пример #14
0
    def test_block_rdd_array(self):
        """``block_rdd`` with default arguments gives one block per partition.

        The first pass uses 10 partitions of 10 rows each; the second uses
        17 partitions, so the rows cannot be split evenly.
        """
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block_rdd(data)
        assert_array_almost_equal(np.ones((10, 1)), blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_almost_equal(np.ones((10, 1)), blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block_rdd(data)
        # Use integer division: `/` gives 5.88... on Python 3, which is not
        # a valid array dimension.
        rows_in_first_block = n_samples // n_partitions
        assert_array_almost_equal(np.ones((rows_in_first_block, 1)),
                                  blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Пример #15
0
def train(matrix, featureSize, labels):
    """Project ``matrix`` onto 100 SVD components, fit an SVM and report.

    The rows are distributed over 10 partitions and blocked, and ``svd``
    is run on the result. The data is projected onto the returned
    components and a one-vs-rest RBF ``SVC`` is fitted and evaluated on
    that same projection, so the printed score is a training accuracy.
    Finally the first sample is reconstructed and passed to ``plot`` as a
    32x32 image.

    Parameters
    ----------
    matrix : array-like
        Sample matrix supporting ``.dot`` and ``.mean(axis=1)``. The final
        reshape to (32, 32) implies 1024 features per row.
    featureSize : int
        Unused; kept so existing callers keep working.
    labels : sequence
        Class labels, indexable by row position.

    Returns
    -------
    None
        Results are only printed and plotted.
    """
    distributed_rows = sc.parallelize(list(matrix), 10)

    # Use spylearn to parallelize the SVD on the RDD.
    blocked = block_rdd(distributed_rows)
    _, _, v = svd(blocked, 100)
    print(v.shape)

    # v holds one component per row, so v.T maps samples to component space.
    transformed = matrix.dot(v.T)
    print(transformed.shape)

    classifier = OneVsRestClassifier(SVC(kernel="rbf"))
    classifier = classifier.fit(transformed, labels)
    predictions = classifier.predict(transformed)
    print(predictions[0])

    # Float accumulator keeps the ratio a true division on Python 2 as well.
    correct = 0.0
    for index, predicted in enumerate(predictions):
        if labels[index] == predicted:
            correct += 1
    print(correct / len(predictions))

    print('plot reconstructed data')
    # Map back to feature space with v, not v.T: `transformed` is
    # (n_samples, k) and v is (k, n_features), so v.T has mismatched shapes.
    # NOTE(review): the data is never centred before projection, so adding
    # the per-row mean here looks questionable -- confirm the intent.
    recData = transformed.dot(v) + matrix.mean(axis=1)[:, None]
    plot(recData[0].reshape((32, 32)))
Пример #16
0
 def setUp(self):
     """Build the shared binary-classification fixture if it is missing.

     When ``self.data`` is ``None``, draws 1000 samples with 50 features,
     labels them by thresholding a random linear projection, and stores
     the raw arrays, the class list, a cached RDD of ``(x, y)`` pairs and
     its blocked form on ``self``.
     """
     super(LinearModelTestCase, self).setUp()
     if self.data is None:
         rng = np.random.RandomState(42)
         X = rng.normal(size=(int(1e3), 50))
         coef = rng.normal(size=50)
         # The np.int alias was removed in NumPy 1.24; builtin int gives
         # the same default integer dtype.
         y = (np.dot(X, coef) > 0.01).astype(int)
         self.X = X
         self.y = y
         self.classes = np.unique(y)
         self.data = self.sc.parallelize(
             list(zip(X, y)), numSlices=self.n_partitions).cache()
         self.blocked_data = block_rdd(self.data, block_size=171)
Пример #17
0
 def setUp(self):
     """Create the linear-model test data when it has not been built yet.

     The fixture is 1000 random samples with 50 features and 0/1 labels
     derived from a random linear score. It is stored on ``self`` as
     arrays, as a cached RDD of ``(x, y)`` pairs and as a blocked RDD.
     """
     super(LinearModelTestCase, self).setUp()
     if self.data is None:
         rng = np.random.RandomState(42)
         X = rng.normal(size=(int(1e3), 50))
         coef = rng.normal(size=50)
         # Cast the boolean mask with builtin int: NumPy 1.24 removed the
         # np.int alias.
         scores = np.dot(X, coef)
         y = (scores > 0.01).astype(int)
         self.X = X
         self.y = y
         self.classes = np.unique(y)
         self.data = self.sc.parallelize(
             list(zip(X, y)), numSlices=self.n_partitions).cache()
         self.blocked_data = block_rdd(self.data, block_size=171)
Пример #18
0
    def test_block_rdd_tuple(self):
        """Tuple records are blocked element-wise into a tuple of blocks.

        Each record is ``(dense vector, int label, sparse row)``. The first
        block must hold ten stacked vectors, ten zero labels and ten
        stacked sparse rows, and all blocks together must cover every
        sample.
        """
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        data = self.sc.parallelize(
            [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)],
            n_partitions)
        blocked_data = block_rdd(data)

        expected_first_block = np.array([[1., 2.]] * 10)
        # The np.int alias was removed in NumPy 1.24; builtin int gives
        # the same default integer dtype.
        expected_second_block = np.zeros(10, dtype=int)
        expected_third_block = sp.vstack([sparse_row] * 10)

        first_block_tuple = blocked_data.first()
        assert_array_almost_equal(expected_first_block, first_block_tuple[0])
        assert_array_almost_equal(expected_second_block, first_block_tuple[1])
        assert_array_almost_equal(expected_third_block.toarray(),
                                  first_block_tuple[2].toarray())

        tuple_blocks = blocked_data.collect()
        assert_equal(len(tuple_blocks), n_partitions)
        assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
        assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
Пример #19
0
    def test_block_rdd_tuple(self):
        """``block_rdd`` blocks each position of a tuple record separately.

        Records are ``(dense vector, int label, sparse row)``. The test
        checks the three parts of the first block, then the number of
        blocks and the total row count across them.
        """
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        data = self.sc.parallelize(
            [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)],
            n_partitions)
        blocked_data = block_rdd(data)

        expected_first_block = np.array([[1., 2.]] * 10)
        # Builtin int replaces the np.int alias removed in NumPy 1.24.
        expected_second_block = np.zeros(10, dtype=int)
        expected_third_block = sp.vstack([sparse_row] * 10)

        first_block_tuple = blocked_data.first()
        assert_array_almost_equal(expected_first_block, first_block_tuple[0])
        assert_array_almost_equal(expected_second_block, first_block_tuple[1])
        assert_array_almost_equal(expected_third_block.toarray(),
                                  first_block_tuple[2].toarray())

        tuple_blocks = blocked_data.collect()
        assert_equal(len(tuple_blocks), n_partitions)
        assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
        assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
Пример #20
0
 def test_count(self):
     """``count`` on a blocked RDD reports the total number of samples."""
     total, num_slices = 100, 10
     rows = [np.array([1]) for _ in range(total)]
     rdd = self.sc.parallelize(rows, num_slices)
     blocked = block_rdd(rdd)
     assert_array_almost_equal(total, count(blocked))
Пример #21
0
 def test_block_empty_rdd(self):
     """Blocking an empty RDD produces no blocks."""
     num_slices = 3
     source = self.sc.parallelize([], num_slices)
     collected = block_rdd(source).collect()
     assert_equal(len(collected), 0)
Пример #22
0
 def test_block_empty_rdd(self):
     """An RDD with no elements collects to zero blocks after blocking."""
     partition_count = 3
     # Three partitions are requested even though there is no data.
     nothing = self.sc.parallelize([], partition_count)
     assert_equal(len(block_rdd(nothing).collect()), 0)
Пример #23
0
 def test_count(self):
     """Check that ``count`` returns the sample total for a blocked RDD."""
     sample_count = 100
     partition_count = 10
     samples = [np.array([1]) for _ in range(sample_count)]
     blocks = block_rdd(self.sc.parallelize(samples, partition_count))
     counted = count(blocks)
     assert_array_almost_equal(sample_count, counted)