def test_same_prediction(self):
    """Local and Spark MultinomialNB should agree on every predicted label."""
    X, y, Z = self.make_classification(4, 100000, nonnegative=True)
    classes = np.unique(y)
    expected = MultinomialNB().fit(X, y).predict(X)
    predicted = SparkMultinomialNB().fit(Z, classes=classes).predict(Z[:, 'X'])
    # the distributed result is an RDD of blocks; flatten before comparing
    assert_array_almost_equal(expected, np.concatenate(predicted.collect()))
def test_same_coefs(self):
    """Distributed logistic regression coefficients should roughly match local."""
    X, y, Z = self.make_classification(2, 10000)
    params = dict(tol=1e-4, C=10)
    local = LogisticRegression(**params)
    dist = SparkLogisticRegression(**params)
    local.fit(X, y)
    dist.fit(Z, classes=np.unique(y))
    # only loose agreement (1 decimal) is expected from the distributed fit
    assert_array_almost_equal(local.coef_, dist.coef_, decimal=1)
def test_same_coefs(self):
    """Distributed LinearSVC coefficients should closely match the local fit."""
    X, y, Z = self.make_classification(2, 100000)
    reference = LinearSVC().fit(X, y)
    candidate = SparkLinearSVC()
    candidate.fit(Z, classes=np.unique(y))
    assert_array_almost_equal(reference.coef_, candidate.coef_, decimal=3)
def test_same_coefs(self):
    """Distributed linear regression must recover the local coefficients."""
    X, y, Z = self.make_regression(1, 100000)
    reference = LinearRegression().fit(X, y)
    candidate = SparkLinearRegression()
    candidate.fit(Z)
    assert_array_almost_equal(reference.coef_, candidate.coef_)
    assert_array_almost_equal(reference.intercept_, candidate.intercept_)
def test_same_prediction(self):
    """Blocked predictions from SparkMultinomialNB equal the local labels."""
    X, y, Z = self.make_classification(4, 100000, nonnegative=True)
    expected = MultinomialNB().fit(X, y).predict(X)
    predictions = SparkMultinomialNB().fit(
        Z, classes=np.unique(y)).predict(Z[:, 'X'])
    # the prediction RDD must consist of ndarray blocks
    assert_true(check_rdd_dtype(predictions, (np.ndarray,)))
    assert_array_almost_equal(expected, predictions.toarray())
def test_same_prediction(self):
    """SparkLinearRegression predictions must equal the local model's."""
    X, y, Z = self.make_regression(1, 100000)
    expected = LinearRegression().fit(X, y).predict(X)
    predicted = SparkLinearRegression().fit(Z).predict(Z[:, 'X'])
    # the prediction RDD must consist of ndarray blocks
    assert_true(check_rdd_dtype(predicted, (np.ndarray,)))
    assert_array_almost_equal(expected, predicted.toarray())
def test_same_prediction(self):
    """Distributed linear regression predictions match the local model."""
    X, y, Z = self.make_regression(1, 100000)
    local_model = LinearRegression().fit(X, y)
    dist_model = SparkLinearRegression().fit(Z)
    y_dist = dist_model.predict(Z[:, 'X'])
    assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
    assert_array_almost_equal(local_model.predict(X), y_dist.toarray())
def test_sparse_matrix(self):
    """block() must keep scipy sparse rows sparse and stack them per partition.

    With n_samples evenly divided over n_partitions, each partition holds
    n_samples // n_partitions rows, so each block should be the vertical
    stack of that many copies of the row.
    """
    n_partitions = 10
    n_samples = 100
    # derive the expected block height instead of hard-coding 10
    block_size = n_samples // n_partitions
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    data = self.sc.parallelize([sparse_row for i in range(n_samples)],
                               n_partitions)
    blocked_data = block(data)
    # blocking must preserve sparsity
    assert_true(sp.issparse(blocked_data.first()))
    expected_block = sp.vstack([sparse_row] * block_size)
    assert_array_almost_equal(expected_block.toarray(),
                              blocked_data.first().toarray())
def test_same_prediction(self):
    """Distributed GaussianNB statistics should approximate the local fit."""
    X, y, Z = self.make_classification(2, 800000, nonnegative=True)
    local_model = GaussianNB().fit(X, y)
    dist_model = SparkGaussianNB().fit(Z, classes=np.unique(y))
    # TODO: investigate the variance further!
    assert_array_almost_equal(local_model.sigma_, dist_model.sigma_, 2)
    assert_array_almost_equal(local_model.theta_, dist_model.theta_, 6)
def test_block_rdd_dict(self):
    """block() should pack dict rows into blocks of at most ``bsize`` rows.

    The first block's columns 'a' and 'b' must reproduce the generating
    sequences 0..4 and their squares.
    """
    n_partitions = 3
    n_samples = 57
    dicts = [{'a': i, 'b': float(i)**2} for i in range(n_samples)]
    data = self.sc.parallelize(dicts, n_partitions)
    block_data_5 = block(data, bsize=5)
    blocks = block_data_5.collect()
    # no block may exceed the requested block size
    assert_true(all(len(b) <= 5 for b in blocks))
    assert_array_almost_equal(blocks[0][0], np.arange(5))
    # np.float was removed in NumPy 1.20+; the builtin float is the
    # equivalent (float64) dtype
    assert_array_almost_equal(blocks[0][1], np.arange(5, dtype=float)**2)
def _test_func_on_axis(self, func, toarray=True):
    """Compare aggregate ``func`` between a sparse RDD and its local matrix."""
    local, distributed = self.make_sparse_rdd(block_size=100)
    # full reduction first, then each axis in turn
    assert_almost_equal(getattr(distributed, func)(),
                        getattr(local, func)())
    for axis in (0, 1):
        dist_result = getattr(distributed, func)(axis=axis)
        local_result = getattr(local, func)(axis=axis)
        if toarray:
            # sparse results need densifying before elementwise comparison
            assert_array_almost_equal(dist_result.toarray(),
                                      local_result.toarray())
        else:
            assert_array_almost_equal(dist_result, local_result)
def test_block_rdd_dict(self):
    """block() should pack dict rows into blocks of at most ``bsize`` rows.

    Verifies both the size cap and that the first block reproduces the
    generating sequences for keys 'a' and 'b'.
    """
    n_partitions = 3
    n_samples = 57
    dicts = [{'a': i, 'b': float(i) ** 2} for i in range(n_samples)]
    data = self.sc.parallelize(dicts, n_partitions)
    block_data_5 = block(data, bsize=5)
    blocks = block_data_5.collect()
    # no block may exceed the requested block size
    assert_true(all(len(b) <= 5 for b in blocks))
    assert_array_almost_equal(blocks[0][0], np.arange(5))
    # np.float was removed in NumPy 1.20+; the builtin float is the
    # equivalent (float64) dtype
    assert_array_almost_equal(blocks[0][1], np.arange(5, dtype=float) ** 2)
def _test_func_on_axis(self, func):
    """Verify ``func`` agrees between a dense RDD and its local ndarray."""
    local, distributed = self.make_dense_rdd(block_size=100)
    assert_almost_equal(getattr(distributed, func)(),
                        getattr(local, func)())
    for axis in (0, 1):
        assert_array_almost_equal(getattr(distributed, func)(axis=axis),
                                  getattr(local, func)(axis=axis))
    # repeat on a 3-dimensional array to cover higher-rank axes
    local, distributed = self.make_dense_rdd((100, 3, 2))
    assert_almost_equal(getattr(distributed, func)(),
                        getattr(local, func)())
    for axis in (0, 1, 2):
        assert_array_almost_equal(getattr(distributed, func)(axis=axis),
                                  getattr(local, func)(axis=axis))
def test_same_transform_result(self):
    """TF-IDF over blocked RDD input should reproduce the local transform."""
    X, y, Z_rdd = self.make_classification(4, 1000, -1)
    X_rdd = Z_rdd[:, 'X']
    Z_local = TfidfTransformer().fit_transform(X)
    Z_dist = SparkTfidfTransformer().fit_transform(X_rdd)
    # distributed result must stay sparse
    assert_true(check_rdd_dtype(Z_dist, sp.spmatrix))
    assert_array_almost_equal(Z_local.toarray(), Z_dist.toarray())
def test_same_transform_result(self):
    """TF-IDF on an unblocked RDD should reproduce the local transform."""
    X, y, Z_rdd = self.make_classification(4, 1000, None)
    X_rdd = Z_rdd[:, 'X']
    Z_local = TfidfTransformer().fit_transform(X)
    # collect the distributed blocks and stack them into one sparse matrix
    collected = SparkTfidfTransformer().fit_transform(X_rdd).collect()
    Z_dist = sp.vstack(collected)
    assert_array_almost_equal(Z_local.toarray(), Z_dist.toarray())
def test_same_centroids(self):
    """Spark KMeans should find (nearly) the same centroids as the local fit."""
    X, y, X_rdd = self.make_blobs(centers=4, n_samples=200000)
    kwargs = dict(n_clusters=4, init='k-means++', random_state=42)
    local = KMeans(**kwargs)
    dist = SparkKMeans(**kwargs)
    local.fit(X)
    dist.fit(X_rdd)
    # centroid ordering is arbitrary, so sort before comparing
    assert_array_almost_equal(np.sort(local.cluster_centers_, axis=0),
                              np.sort(dist.cluster_centers_, axis=0),
                              decimal=4)
def _test_func_on_axis(self, func):
    """Dense RDD aggregate ``func`` must equal the numpy result on every axis."""
    def compare(local, distributed, axes):
        # one-line helper: check the full reduction, then every axis
        assert_almost_equal(getattr(distributed, func)(),
                            getattr(local, func)())
        for axis in axes:
            assert_array_almost_equal(getattr(distributed, func)(axis=axis),
                                      getattr(local, func)(axis=axis))

    compare(*self.make_dense_rdd(block_size=100), axes=(0, 1))
    # 3-dimensional case covers higher-rank axes as well
    compare(*self.make_dense_rdd((100, 3, 2)), axes=(0, 1, 2))
def test_svd(self):
    """Truncated SVD on an RDD should match the leading local singular triplet."""
    X, X_rdd = self.make_dense_rdd()
    u, s, v = svd(X_rdd, 1)
    # collect the distributed left singular vector into a flat local array
    u_local = np.squeeze(np.concatenate(np.array(u.collect()))).T
    u_true, s_true, v_true = ln.svd(X)
    # singular vectors are defined only up to sign, so align before comparing
    assert_array_almost_equal(v[0], match_sign(v[0], v_true[0, :]))
    assert_array_almost_equal(s[0], s_true[0])
    assert_array_almost_equal(u_local, match_sign(u_local, u_true[:, 0]))
def test_block_rdd_tuple(self):
    """block() should pack tuple rows column-wise into per-partition blocks.

    Each partition of 10 identical (ndarray, int, sparse) rows must become
    one tuple of (stacked ndarray, int vector, stacked sparse matrix), and
    the blocks together must account for every sample.
    """
    n_partitions = 10
    n_samples = 100
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    data = self.sc.parallelize([(np.array([1., 2.]), 0, sparse_row)
                                for i in range(n_samples)], n_partitions)
    blocked_data = block(data)
    expected_first_block = np.array([[1., 2.]] * 10)
    # np.int was removed in NumPy 1.20+; the builtin int gives the
    # equivalent platform-default integer dtype
    expected_second_block = np.zeros(10, dtype=int)
    expected_third_block = sp.vstack([sparse_row] * 10)
    first_block_tuple = blocked_data.first()
    assert_array_almost_equal(expected_first_block, first_block_tuple[0])
    assert_array_almost_equal(expected_second_block, first_block_tuple[1])
    assert_array_almost_equal(expected_third_block.toarray(),
                              first_block_tuple[2].toarray())
    tuple_blocks = blocked_data.collect()
    # one block per partition, jointly covering all samples
    assert_equal(len(tuple_blocks), n_partitions)
    assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
    assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
def test_block_rdd_tuple(self):
    """block() should pack tuple rows column-wise into per-partition blocks.

    Verifies the first block's three columns and that the collected blocks
    jointly account for every sample.
    """
    n_partitions = 10
    n_samples = 100
    sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
    data = self.sc.parallelize(
        [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)],
        n_partitions)
    blocked_data = block(data)
    expected_first_block = np.array([[1., 2.]] * 10)
    # np.int was removed in NumPy 1.20+; the builtin int gives the
    # equivalent platform-default integer dtype
    expected_second_block = np.zeros(10, dtype=int)
    expected_third_block = sp.vstack([sparse_row] * 10)
    first_block_tuple = blocked_data.first()
    assert_array_almost_equal(expected_first_block, first_block_tuple[0])
    assert_array_almost_equal(expected_second_block, first_block_tuple[1])
    assert_array_almost_equal(expected_third_block.toarray(),
                              first_block_tuple[2].toarray())
    tuple_blocks = blocked_data.collect()
    # one block per partition, jointly covering all samples
    assert_equal(len(tuple_blocks), n_partitions)
    assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
    assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
def test_dot(self):
    """Sparse RDD ``.dot()`` must agree with scipy's matrix product both ways."""
    A, A_rdd = self.make_sparse_rdd((20, 10))
    B, B_rdd = self.make_sparse_rdd((10, 20))
    # check both multiplication orders
    for rdd_matrix, local_matrix, other in ((A_rdd, A, B), (B_rdd, B, A)):
        assert_array_almost_equal(rdd_matrix.dot(other).toarray(),
                                  local_matrix.dot(other).toarray())
def test_dot(self):
    """ArrayRDD ``.dot()`` with a local ndarray equals numpy's dot product."""
    left = np.arange(200).reshape(20, 10)
    right = np.arange(200).reshape(10, 20)
    left_rdd = ArrayRDD(self.sc.parallelize(left))
    assert_array_almost_equal(unpack(left_rdd.dot(right)), left.dot(right))
def test_dot_sparse(self):
    """Sparse ArrayRDD ``.dot()`` with a local sparse matrix matches scipy."""
    local, rdd = self.generate_sparse_dataset(shape=(10, 20))
    other = sp.rand(20, 10, random_state=2, density=0.1)
    expected = local.dot(other).toarray()
    assert_array_almost_equal(unpack(rdd.dot(other)).toarray(), expected)
def test_mean_sparse(self):
    """``mean()`` of a sparse ArrayRDD must match the local sparse mean."""
    data, rdd = self.generate_sparse_dataset()
    distributed = ArrayRDD(rdd)
    # overall mean, then each axis
    assert_almost_equal(distributed.mean(), data.mean())
    for axis in (0, 1):
        assert_array_almost_equal(distributed.mean(axis=axis),
                                  data.mean(axis=axis))