# Tests for spylearn's RDD blocking and blocked linear algebra helpers.
# The test functions below are methods of SparkContext-backed unittest
# classes (each receives `self` with a live `self.sc`). The import paths
# are assumptions based on how the helpers are used here.
import numpy as np
import numpy.linalg as ln
import scipy.sparse as sp

from nose.tools import assert_equal, assert_true
from numpy.testing import assert_array_almost_equal

from spylearn.block_rdd import block_rdd
from spylearn.blocked_math import count, cov, svd, svd_em

# `match_sign` is assumed to be a small test helper that aligns the sign
# conventions of two singular vectors before comparison.


def test_block_rdd_array_block_size(self):
    n_partitions = 10
    n_samples = 107
    data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                               n_partitions)

    block_data_5 = block_rdd(data, block_size=5)
    blocks = block_data_5.collect()
    assert_true(all(len(b) <= 5 for b in blocks))

    block_data_10 = block_rdd(data, block_size=10)
    blocks = block_data_10.collect()
    assert_true(all(len(b) <= 10 for b in blocks))
def test_cov(self):
    rng = np.random.RandomState(42)
    true_cov = np.array([[3., 2., 4.],
                         [2., 2., 5.],
                         [4., 5., 6.]])
    mat = rng.multivariate_normal(np.array([1., 2., 3.]), true_cov,
                                  size=int(1e3))
    data = block_rdd(self.sc.parallelize(mat, 4))
    rdd_cov = cov(data)
    assert_array_almost_equal(np.cov(mat.T), rdd_cov, decimal=1)
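# A minimal sketch of how a covariance like the one tested above could be
# computed over blocked data, assuming each block is a (block_size,
# n_features) ndarray. It illustrates the map/reduce structure only and is
# not necessarily spylearn's actual implementation of `cov`.
def cov_sketch(blocked_rdd):
    """Sample covariance of a blocked RDD of row blocks."""
    n = blocked_rdd.map(lambda b: b.shape[0]).sum()
    total = blocked_rdd.map(lambda b: b.sum(axis=0)).sum()
    mean = total / n
    # Accumulate the centered scatter matrix block by block.
    scatter = blocked_rdd.map(
        lambda b: np.dot((b - mean).T, b - mean)).sum()
    # Divide by n - 1 to match np.cov's unbiased normalization.
    return scatter / (n - 1)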
def test_svd(self):
    rng = np.random.RandomState(42)
    mat = rng.randn(int(1e3), 10)
    data = block_rdd(self.sc.parallelize(list(mat), 10))
    u, s, v = svd(data, 1)
    u = np.squeeze(np.concatenate(np.array(u.collect()))).T
    u_true, s_true, v_true = ln.svd(mat)
    assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :]))
    assert_array_almost_equal(s[0], s_true[0])
    assert_array_almost_equal(u, match_sign(u, u_true[:, 0]))
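# A minimal sketch of a distributed top-k SVD via the Gram matrix, assuming
# blocks are (block_size, n_features) ndarrays with small n_features, so
# X^T X fits on the driver. The name `svd_gram_sketch` and its structure are
# illustrative; spylearn's `svd` may work differently, but it matches the
# return convention the tests rely on (distributed u, local s and v).
def svd_gram_sketch(blocked_rdd, k):
    # X^T X accumulated across blocks, then eigendecomposed on the driver.
    gram = blocked_rdd.map(lambda b: np.dot(b.T, b)).sum()
    eigvals, eigvecs = np.linalg.eigh(gram)
    order = np.argsort(eigvals)[::-1][:k]
    s = np.sqrt(eigvals[order])   # top-k singular values of X
    v = eigvecs[:, order].T       # right singular vectors, shape (k, n_features)
    # Left singular vectors stay distributed: u = X V S^{-1}.
    u = blocked_rdd.map(lambda b: np.dot(b, v.T) / s)
    return u, s, v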
def test_block_rdd_dict(self):
    n_partitions = 3
    n_samples = 57
    dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)]
    data = self.sc.parallelize(dicts, n_partitions)

    block_data_5 = block_rdd(data, block_size=5)
    blocks = block_data_5.collect()
    assert_true(all(len(b) <= 5 for b in blocks))
    assert_array_almost_equal(blocks[0].a, np.arange(5))
    assert_array_almost_equal(blocks[0].b, np.arange(5, dtype=float) ** 2)
def test_svd_em(self):
    rng = np.random.RandomState(42)
    mat = rng.randn(10, 3)
    data = block_rdd(self.sc.parallelize(list(mat), 2)).cache()
    u, s, v = svd_em(data, 1, seed=42)
    u = np.squeeze(np.concatenate(np.array(u.collect()))).T
    u_true, s_true, v_true = ln.svd(mat)
    tol = 1  # the EM-based SVD is approximate, so compare to 1 decimal only
    assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :]), tol)
    assert_array_almost_equal(s[0], s_true[0], tol)
    assert_array_almost_equal(u, match_sign(u, u_true[:, 0]), tol)
def test_block_rdd_array(self):
    n_partitions = 10
    n_samples = 100
    data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                               n_partitions)
    blocked_data = block_rdd(data)
    assert_array_almost_equal(np.ones((10, 1)), blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_array_almost_equal(np.ones((10, 1)), blocks[-1])
    assert_equal(sum(len(b) for b in blocks), n_samples)

    n_partitions = 17
    data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                               n_partitions)
    blocked_data = block_rdd(data)
    # With uneven partitioning the first partition holds
    # n_samples // n_partitions elements (integer division).
    assert_array_almost_equal(np.ones((n_samples // n_partitions, 1)),
                              blocked_data.first())
    blocks = blocked_data.collect()
    assert_equal(len(blocks), n_partitions)
    assert_equal(sum(len(b) for b in blocks), n_samples)
def test_block_rdd_sp_matrix(self):
    n_partitions = 10
    n_samples = 100
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    data = self.sc.parallelize([sparse_row for i in range(n_samples)],
                               n_partitions)
    blocked_data = block_rdd(data)
    assert_true(sp.issparse(blocked_data.first()))
    expected_block = sp.vstack([sparse_row] * 10)
    assert_array_almost_equal(expected_block.toarray(),
                              blocked_data.first().toarray())
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC


def train(matrix, featureSize, labels):
    # Assumes a live SparkContext bound to the global `sc` and matplotlib's
    # `plot` in scope.
    # Use spylearn to parallelize the SVD on an RDD.
    distributed_matrix = sc.parallelize(list(matrix), 10)
    data = block_rdd(distributed_matrix)
    u, s, v = svd(data, 100)
    print(v.shape)

    # Old unparallelized version:
    # cov_mat = numpy.cov(matrix.T)
    # eig_val_cov, eig_vec_cov = numpy.linalg.eig(cov_mat)
    # # Make a list of (eigenvalue, eigenvector) tuples.
    # eig_pairs = [(numpy.abs(eig_val_cov[i]), eig_vec_cov[:, i])
    #              for i in range(len(eig_val_cov))]
    # # Sort the (eigenvalue, eigenvector) tuples from high to low.
    # eig_pairs.sort()
    # eig_pairs.reverse()
    # matrix_w = eig_pairs[0][1].reshape(featureSize, 1)
    # for i in range(200):
    #     matrix_w = numpy.hstack((matrix_w,
    #                              eig_pairs[i + 1][1].reshape(featureSize, 1)))

    # Project onto the top right singular vectors.
    transformed = matrix.dot(v.T)
    print(transformed.shape)

    # Optionally load a previously trained model instead of refitting:
    # if os.path.isfile('svm.model'):
    #     print('Loading model file...')
    #     with open('svm.model', 'rb') as f:
    #         clf = pickle.load(f)
    # else:
    #     fit as below, then pickle.dump(clf, f)

    # Train a one-vs-rest RBF SVM on the projected data.
    clf = OneVsRestClassifier(SVC(kernel="rbf")).fit(transformed, labels)
    predictions = clf.predict(transformed)
    print(predictions[0])

    # Training accuracy.
    correct = 0.0
    for x in range(len(predictions)):
        if labels[x] == predictions[x]:
            correct += 1
    print(correct / len(predictions))

    print('plot reconstructed data')
    # Reconstruction uses v, not v.T: transformed is (n_samples, k) and v is
    # (k, n_features), so transformed.dot(v) maps back to feature space.
    rec_data = transformed.dot(v) + matrix.mean(axis=1)[:, None]
    plot(rec_data[0].reshape((32, 32)))
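# A hypothetical way to invoke `train`, assuming `matrix` holds flattened
# 32x32 images (1024 features per row), as the reshape in the plotting step
# suggests, and that a SparkContext is already bound to the global `sc`.
# The data and shapes here are synthetic and purely illustrative.
demo_rng = np.random.RandomState(0)
demo_matrix = demo_rng.randn(500, 1024)     # 500 flattened 32x32 "images"
demo_labels = demo_rng.randint(0, 10, 500)  # 10 synthetic classes
# train(demo_matrix, 1024, demo_labels)     # requires a running SparkContext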
def setUp(self):
    super(LinearModelTestCase, self).setUp()
    if self.data is None:
        rng = np.random.RandomState(42)
        X = rng.normal(size=(int(1e3), 50))
        coef = rng.normal(size=50)
        y = (np.dot(X, coef) > 0.01).astype(int)
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        self.data = self.sc.parallelize(
            list(zip(X, y)), numSlices=self.n_partitions).cache()
        self.blocked_data = block_rdd(self.data, block_size=171)
def test_block_rdd_tuple(self):
    n_partitions = 10
    n_samples = 100
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    data = self.sc.parallelize(
        [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)],
        n_partitions)
    blocked_data = block_rdd(data)

    expected_first_block = np.array([[1., 2.]] * 10)
    expected_second_block = np.zeros(10, dtype=int)
    expected_third_block = sp.vstack([sparse_row] * 10)

    first_block_tuple = blocked_data.first()
    assert_array_almost_equal(expected_first_block, first_block_tuple[0])
    assert_array_almost_equal(expected_second_block, first_block_tuple[1])
    assert_array_almost_equal(expected_third_block.toarray(),
                              first_block_tuple[2].toarray())

    tuple_blocks = blocked_data.collect()
    assert_equal(len(tuple_blocks), n_partitions)
    assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
    assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
def test_count(self):
    n_samples = 100
    n_partitions = 10
    mat = [np.array([1]) for i in range(n_samples)]
    data = block_rdd(self.sc.parallelize(mat, n_partitions))
    assert_array_almost_equal(n_samples, count(data))
def test_block_empty_rdd(self):
    n_partitions = 3
    empty_data = self.sc.parallelize([], n_partitions)
    blocks = block_rdd(empty_data).collect()
    assert_equal(len(blocks), 0)
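# A minimal sketch of what `block_rdd` presumably does, based on the behavior
# the tests above exercise: group each partition's records into blocks of at
# most `block_size` rows, stacking dense rows with np.array and sparse rows
# with sp.vstack. Dict and tuple records would be handled analogously (a
# namedtuple of column arrays, or a tuple of per-field blocks); this sketch
# covers only the dense and sparse cases and is not the real implementation.
def block_rdd_sketch(rdd, block_size=None):
    def to_blocks(iterator):
        rows = list(iterator)
        if not rows:
            return  # empty partition yields no blocks
        size = block_size or len(rows)
        for start in range(0, len(rows), size):
            chunk = rows[start:start + size]
            if sp.issparse(chunk[0]):
                yield sp.vstack(chunk)
            else:
                yield np.array(chunk)
    return rdd.mapPartitions(to_blocks)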