예제 #1
0
 def test_unblocking_rdd(self):
     """Check that ``unblock()`` turns an ArrayRDD back into a plain RDD.

     A 400-element range is parallelized over 4 partitions and wrapped in
     an ``ArrayRDD`` built with 5 as its second argument. After unblocking,
     the result must be an ``RDD`` whose first 12 items are ``0..11``.
     """
     source = np.arange(400)
     blocked = ArrayRDD(self.sc.parallelize(source, 4), 5)
     flat = blocked.unblock()
     assert_is_instance(flat, RDD)
     head = flat.take(12)
     assert_array_equal(head, np.arange(12).tolist())
예제 #2
0
 def test_unblocking_rdd(self):
     """Verify ``ArrayRDD.unblock()`` returns a plain ``RDD`` of the items.

     The source is ``np.arange(400)`` spread over 4 partitions; only the
     first 12 unblocked items are compared against ``0..11``.
     """
     values = np.arange(400)
     wrapped = ArrayRDD(self.sc.parallelize(values, 4), 5)
     unblocked = wrapped.unblock()
     assert_is_instance(unblocked, RDD)
     first_items = unblocked.take(12)
     assert_array_equal(first_items, np.arange(12).tolist())
예제 #3
0
 def test_convert_toiter(self):
     """Check that ``toiter()`` yields the same content as ``collect()``.

     The object returned by ``toiter()`` must be an iterator, and draining
     it into a list must equal what ``collect()`` returns.
     """
     # ``collections.Iterator`` was removed in Python 3.10; the ABC lives in
     # ``collections.abc``. Python 2 lacks that module, hence the fallback.
     try:
         from collections.abc import Iterator
     except ImportError:
         from collections import Iterator

     data = np.arange(40)
     rdd = self.sc.parallelize(data, 4)
     X = ArrayRDD(rdd, 5)
     X_iter = X.toiter()
     assert_is_instance(X_iter, Iterator)
     assert_array_equal(list(X_iter), X.collect())
예제 #4
0
 def test_size(self):
     """Check that ``ArrayRDD.size`` matches the source array's ``size``.

     For several 2-D reshapes of a 4000-element range, both the sum of the
     per-block ``size`` values and the ``size`` attribute of the ArrayRDD
     must equal ``ndarray.size``.
     """
     flat = np.arange(4000)
     for shape in ((1000, 4), (200, 20), (100, 40), (2000, 2)):
         expected = flat.reshape(shape)
         rdd = self.sc.parallelize(expected)
         summed = ArrayRDD(rdd).map(lambda block: block.size).sum()
         assert_equal(summed, expected.size)
         assert_equal(ArrayRDD(rdd).size, expected.size)
예제 #5
0
    def test_transform(self):
        """Check that ``transform(fn)`` applies ``fn`` to every block.

        The expected value is built locally by applying the same function
        to each block returned by ``collect()``.
        """
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, 4)
        X = ArrayRDD(rdd, 5)

        def square(block):
            return block ** 2

        # Materialize a list: on Python 3 ``map`` returns a lazy iterator,
        # which ``assert_array_equal`` cannot compare element-wise.
        X1 = [square(block) for block in X.collect()]
        X2 = X.transform(square).collect()

        assert_array_equal(X1, X2)
예제 #6
0
    def test_convert_toarray(self):
        """Check that ``toarray()`` reassembles the original data.

        Covers a NumPy range wrapped with 5 as the second ``ArrayRDD``
        argument, and a plain Python list wrapped with no extra argument.
        """
        source = np.arange(400)
        blocked = ArrayRDD(self.sc.parallelize(source, 4), 5)
        assert_array_equal(blocked.toarray(), source)

        values = [2, 3, 5, 1, 6, 7, 9, 9]
        wrapped = ArrayRDD(self.sc.parallelize(values, 2))
        assert_array_equal(wrapped.toarray(), np.array(values))
예제 #7
0
    def make_regression(self, n_targets, n_samples, blocks=-1):
        """Build a seeded regression dataset locally and as a ``DictRDD``.

        Returns a tuple ``(X, y, Z)``: ``X`` and ``y`` come from the
        module-level ``make_regression`` generator (20 features, 10
        informative, ``random_state=42``); ``Z`` is a ``DictRDD`` with
        columns ``('X', 'y')`` built from them with ``bsize=blocks``.
        """
        # Inside the method body the bare name resolves to the module-level
        # generator function, not to this method.
        X, y = make_regression(
            n_samples=n_samples,
            n_targets=n_targets,
            n_features=20,
            n_informative=10,
            random_state=42,
        )

        distributed = [ArrayRDD(self.sc.parallelize(part)) for part in (X, y)]
        Z = DictRDD(distributed, columns=('X', 'y'), bsize=blocks)
        return X, y, Z
예제 #8
0
    def test_convert_tolist(self):
        """Check that ``tolist()`` returns a flat Python list of the data.

        Covers a NumPy range wrapped with 5 as the second ``ArrayRDD``
        argument, and a plain Python list wrapped with no extra argument.
        """
        source = np.arange(400)
        as_list = ArrayRDD(self.sc.parallelize(source, 4), 5).tolist()
        assert_is_instance(as_list, list)
        assert_equal(as_list, source.tolist())

        values = [2, 3, 5, 1, 6, 7, 9, 9]
        as_list = ArrayRDD(self.sc.parallelize(values, 2)).tolist()
        assert_is_instance(as_list, list)
        assert_equal(as_list, values)
예제 #9
0
    def test_get_single_item(self):
        """Check single-block access via ``X[i]`` and ``X.ix(i)``.

        A (100, 4) range is wrapped with 5 as the second ``ArrayRDD``
        argument; each selected block is expected to be a (5, 4) slice of
        the range. Negative indices are checked against the same expected
        block as their positive counterparts (19/-1 and 17/-3).
        """
        data = np.arange(400).reshape((100, 4))
        X = ArrayRDD(self.sc.parallelize(data, 4), 5)

        assert_array_equal(X.first(), np.arange(0, 20).reshape((5, 4)))

        # (indices that must select the same block, (start, stop) of values)
        cases = (
            ((0,), (0, 20)),
            ((1,), (20, 40)),
            ((19, -1), (380, 400)),
            ((17, -3), (340, 360)),
        )
        for indices, (start, stop) in cases:
            expected = np.arange(start, stop).reshape((5, 4))
            for index in indices:
                assert_array_equal(X[index].first(), expected)
                assert_array_equal(X.ix(index).first(), expected)
예제 #10
0
    def test_initialization(self):
        """Check which constructor inputs ``ArrayRDD`` accepts.

        Passing a plain Python list must raise ``TypeError`` regardless of
        the second argument; passing an RDD must yield an ``ArrayRDD``.
        """
        n_partitions = 4
        n_samples = 100

        data = [np.array([1, 2]) for _ in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        # Not an RDD: every variant must be rejected.
        for extra in ((), (False,), (10,)):
            assert_raises(TypeError, ArrayRDD, data, *extra)

        # A real RDD: every variant must construct successfully.
        for extra in ((), (10,), (None,)):
            assert_is_instance(ArrayRDD(rdd, *extra), ArrayRDD)
 def make_dict_dataset(self, blocks=-1):
     """Return a small list of feature dicts and its ``ArrayRDD`` wrapper.

     The list holds eight dicts with mixed keys and both numeric and string
     values. It is parallelized over 4 partitions and wrapped in an
     ``ArrayRDD`` with ``blocks`` as the second positional argument.
     Returns ``(X, X_rdd)``.
     """
     X = [
         {"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 6, "baz": 1},
         {"bar": 4, "ewo": "ok"},
         {"bar": 4, "baz": 2},
         {"bar": 9, "ewo": "fail"},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2},
     ]
     rdd = self.sc.parallelize(X, 4)
     return X, ArrayRDD(rdd, blocks)
예제 #12
0
 def test_ndim(self):
     """Check that ``ArrayRDD.ndim`` matches the source array's ``ndim``.

     The same 4000-element range is reshaped to 1, 2, 3 and 4 dimensions.
     """
     flat = np.arange(4000)
     for shape in ((4000,), (1000, 4), (200, 10, 2), (100, 10, 2, 2)):
         expected = flat.reshape(shape)
         distributed = ArrayRDD(self.sc.parallelize(expected))
         assert_equal(distributed.ndim, expected.ndim)
예제 #13
0
 def test_shape(self):
     """Check that ``ArrayRDD.shape`` equals the shape of the source array.

     Several 2-D reshapes of a 4000-element range are tried.
     """
     flat = np.arange(4000)
     for shape in ((1000, 4), (200, 20), (100, 40), (2000, 2)):
         distributed = ArrayRDD(self.sc.parallelize(flat.reshape(shape)))
         assert_equal(distributed.shape, shape)
예제 #14
0
    def test_blocks_size(self):
        """Check the per-block row counts for various block sizes.

        With 1000 samples over 10 partitions the test expects: 100 rows per
        block when no size is given, exactly the requested size for 5 and
        50, 100 when 250 is requested, and a mix of 66 and 34 for 66.
        """
        n_partitions = 10
        n_samples = 1000

        data = [np.array([1, 2]) for _ in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        def block_lengths(*args):
            # Row count of every block of an ArrayRDD built with ``args``.
            return ArrayRDD(rdd, *args).map(lambda x: x.shape[0]).collect()

        assert_true(all(np.array(block_lengths()) == 100))
        for requested, expected in ((5, 5), (50, 50), (250, 100)):
            assert_true(all(np.array(block_lengths(requested)) == expected))
        assert_true(all(np.in1d(block_lengths(66), [66, 34])))
예제 #15
0
 def make_dense_randint_rdd(self,
                            low,
                            high=None,
                            shape=(1e3, 10),
                            block_size=-1):
     """Return a random integer array and its ``ArrayRDD`` wrapper.

     ``low`` and ``high`` are forwarded to ``np.random.randint``. ``shape``
     gives the array dimensions; float values such as ``1e3`` are accepted
     and converted to ``int``. The array is parallelized over 4 partitions
     and wrapped with ``bsize=block_size``. Returns ``(X, X_rdd)``.
     """
     # NumPy rejects float dimensions (the default contains ``1e3``), so
     # coerce every dimension to int; a scalar shape becomes a 1-tuple.
     dims = tuple(int(dim) for dim in np.atleast_1d(shape))
     X = np.random.randint(low, high, size=dims)
     X_rdd = ArrayRDD(self.sc.parallelize(X, 4), bsize=block_size)
     return X, X_rdd
예제 #16
0
    def test_creation_from_blocked_rdds(self):
        """Check building a ``DictRDD`` from already-blocked RDDs.

        Two ``ArrayRDD`` columns and one ``BlockRDD`` column with
        ``dtype=list`` are combined. The first element of the result must
        be the expected tuple whether the ``DictRDD`` is built with no
        options, with explicit column names, or with explicit dtypes; in
        the last case the third member must be a ``list``.
        """
        x_rdd = ArrayRDD(self.sc.parallelize(np.arange(80).reshape((40, 2)), 4))
        y_rdd = ArrayRDD(self.sc.parallelize(np.arange(40), 4))
        z_rdd = BlockRDD(self.sc.parallelize(list(range(40)), 4), dtype=list)
        members = (x_rdd, y_rdd, z_rdd)

        expected = (
            np.arange(20).reshape(10, 2),
            np.arange(10),
            list(range(10)),
        )

        assert_tuple_equal(DictRDD(list(members)).first(), expected)

        named = DictRDD(list(members), columns=('x', 'y', 'z'))
        assert_tuple_equal(named.first(), expected)

        typed = DictRDD(list(members), dtype=(None, None, list))
        first = typed.first()
        assert_tuple_equal(first, expected)
        assert_is_instance(first[2], list)
예제 #17
0
    def test_get_single_item(self):
        """Check single-block access via ``X[i]``.

        A (100, 4) range is wrapped with 5 as the second ``ArrayRDD``
        argument; each selected block is expected to be a (5, 4) slice of
        the range. Negative indices are checked against the same expected
        block as their positive counterparts (19/-1 and 17/-3).
        """
        data = np.arange(400).reshape((100, 4))
        X = ArrayRDD(self.sc.parallelize(data, 4), 5)

        assert_array_equal(X.first(), np.arange(0, 20).reshape((5, 4)))

        # (indices that must select the same block, (start, stop) of values)
        cases = (
            ((0,), (0, 20)),
            ((1,), (20, 40)),
            ((19, -1), (380, 400)),
            ((17, -3), (340, 360)),
        )
        for indices, (start, stop) in cases:
            expected = np.arange(start, stop).reshape((5, 4))
            for index in indices:
                assert_array_equal(X[index].first(), expected)
예제 #18
0
    def make_classification(self,
                            n_classes,
                            n_samples,
                            blocks=-1,
                            nonnegative=False):
        """Build a seeded classification dataset locally and as a ``DictRDD``.

        ``X`` and ``y`` come from the module-level ``make_classification``
        generator (5 features, 4 informative, no redundant features, one
        cluster per class, ``random_state=42``). When ``nonnegative`` is
        true, ``X`` is replaced by its absolute value. Both parts are
        parallelized over 4 partitions and combined into a ``DictRDD`` with
        columns ``('X', 'y')`` and ``bsize=blocks``. Returns ``(X, y, Z)``.
        """
        # Inside the method body the bare name resolves to the module-level
        # generator function, not to this method.
        generator_args = dict(
            n_classes=n_classes,
            n_samples=n_samples,
            n_features=5,
            n_informative=4,
            n_redundant=0,
            n_clusters_per_class=1,
            random_state=42,
        )
        X, y = make_classification(**generator_args)
        X = np.abs(X) if nonnegative else X

        distributed = [
            ArrayRDD(self.sc.parallelize(part, 4)) for part in (X, y)
        ]
        Z = DictRDD(distributed, columns=('X', 'y'), bsize=blocks)
        return X, y, Z
예제 #19
0
    def test_blocks_number(self):
        """Check ``ArrayRDD.blocks`` for various block-size settings.

        Uses 1000 samples over 10 partitions. The expected counts are
        those asserted below, including 1000 blocks for
        ``noblock=True, bsize=1`` and 10 blocks when no size is given.
        """
        n_partitions = 10
        n_samples = 1000

        data = [np.array([1, 2]) for _ in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        assert_equal(1000, ArrayRDD(rdd, noblock=True, bsize=1).blocks)
        assert_equal(10, ArrayRDD(rdd).blocks)

        # (second positional ArrayRDD argument, expected number of blocks)
        expectations = (
            (50, 20),
            (66, 20),
            (100, 10),
            (300, 10),
            (5, 200),
            (10, 100),
        )
        for requested, n_blocks in expectations:
            assert_equal(n_blocks, ArrayRDD(rdd, requested).blocks)
예제 #20
0
    def test_convert_toarray(self):
        """Verify ``ArrayRDD.toarray()`` round-trips the input values.

        First with ``np.arange(400)`` on 4 partitions (second ``ArrayRDD``
        argument 5), then with a short Python list on 2 partitions.
        """
        numbers = np.arange(400)
        numbers_rdd = ArrayRDD(self.sc.parallelize(numbers, 4), 5)
        assert_array_equal(numbers_rdd.toarray(), numbers)

        items = [2, 3, 5, 1, 6, 7, 9, 9]
        items_rdd = ArrayRDD(self.sc.parallelize(items, 2))
        assert_array_equal(items_rdd.toarray(), np.array(items))
예제 #21
0
    def test_partitions_number(self):
        """Check that ``ArrayRDD.partitions`` equals the RDD partition count.

        For 4 and 7 partitions, the reported count must stay the same for
        each of the second ``ArrayRDD`` arguments 5, 10 and 20.
        """
        for n_partitions in (4, 7):
            data = np.arange(400).reshape((100, 4))
            rdd = self.sc.parallelize(data, n_partitions)
            for requested in (5, 10, 20):
                assert_equal(ArrayRDD(rdd, requested).partitions, n_partitions)
예제 #22
0
    def test_convert_tolist(self):
        """Verify ``ArrayRDD.tolist()`` returns a ``list`` equal to the input.

        First with ``np.arange(400)`` on 4 partitions (second ``ArrayRDD``
        argument 5), then with a short Python list on 2 partitions.
        """
        numbers = np.arange(400)
        result = ArrayRDD(self.sc.parallelize(numbers, 4), 5).tolist()
        assert_is_instance(result, list)
        assert_equal(result, numbers.tolist())

        items = [2, 3, 5, 1, 6, 7, 9, 9]
        result = ArrayRDD(self.sc.parallelize(items, 2)).tolist()
        assert_is_instance(result, list)
        assert_equal(result, items)
예제 #23
0
 def make_dense_range_rdd(self, shape=(1e3, 10), block_size=-1):
     """Return a dense range array of ``shape`` and its ``ArrayRDD`` wrapper.

     The array holds ``np.arange(np.prod(shape))`` reshaped to ``shape``;
     float dimensions such as ``1e3`` are accepted. It is parallelized
     over 4 partitions and wrapped with ``bsize=block_size``. Returns
     ``(X, X_rdd)``.
     """
     # ``reshape`` rejects float dimensions (the default contains ``1e3``),
     # so coerce them to int. The element count is still computed from the
     # caller's ``shape`` so the resulting dtype is unchanged.
     dims = tuple(int(dim) for dim in np.atleast_1d(shape))
     X = np.arange(np.prod(shape)).reshape(dims)
     X_rdd = ArrayRDD(self.sc.parallelize(X, 4), bsize=block_size)
     return X, X_rdd
예제 #24
0
 def make_text_rdd(self, blocks=-1):
     """Return ``ALL_FOOD_DOCS`` and its ``ArrayRDD`` wrapper.

     The documents are parallelized over 4 partitions and wrapped with
     ``blocks`` as the second positional ``ArrayRDD`` argument. Returns
     ``(X, X_rdd)``.
     """
     docs = ALL_FOOD_DOCS
     docs_rdd = ArrayRDD(self.sc.parallelize(docs, 4), blocks)
     return docs, docs_rdd
예제 #25
0
 def make_dense_rdd(self, shape=(1e3, 10), block_size=-1):
     """Return a seeded random normal array and its ``ArrayRDD`` wrapper.

     Values are drawn with ``np.random.RandomState(2).randn``. ``shape``
     gives the array dimensions; float values such as ``1e3`` are accepted
     and converted to ``int``. The array is parallelized over 4 partitions
     and wrapped with ``bsize=block_size``. Returns ``(X, X_rdd)``.
     """
     rng = np.random.RandomState(2)
     # ``randn`` rejects float dimensions (the default contains ``1e3``),
     # so coerce every dimension to int before unpacking.
     dims = tuple(int(dim) for dim in shape)
     X = rng.randn(*dims)
     X_rdd = ArrayRDD(self.sc.parallelize(X, 4), bsize=block_size)
     return X, X_rdd
예제 #26
0
from splearn.grid_search import SparkGridSearchCV

# Data preprocessing: load the reviews, drop incomplete rows, binarize labels.
df = pd.read_csv("review.csv", header=None, encoding='latin1')
# Drop rows with missing values *before* deriving the labels: ``NaN <= 5`` is
# False, so a missing value in column 0 would otherwise be turned into class 1
# and survive ``dropna()``. The explicit copy makes the column assignment
# below write to an independent frame.
df = df.dropna().copy()
# Column 0 values <= 5 become class 0; everything else becomes class 1.
df[0] = df[0].apply(lambda score: 0 if score <= 5 else 1)
data = df[1]
target = df[0]
data_train, data_test, target_train, target_test = cross_validation.train_test_split(
    data, target, test_size=0.25, random_state=43)

# Convert the training split to distributed form: one ArrayRDD per column,
# combined into a DictRDD with columns 'X' and 'y'.
train_x = sc.parallelize(data_train)
train_y = sc.parallelize(target_train)
train_x = ArrayRDD(train_x)
train_y = ArrayRDD(train_y)
Z = DictRDD((train_x, train_y),
            columns=('X', 'y'),
            dtype=[np.ndarray, np.ndarray])

# Pipeline: hashing vectorizer -> TF-IDF -> multinomial naive Bayes.
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer(non_negative=True)),  # hashingTF for NB
    ('tfidf', SparkTfidfTransformer()),  # IDF
    ('clf', SparkMultinomialNB(alpha=0.05))  # NB
))

# Fit on the distributed training data; the class labels are passed to the
# 'clf' step explicitly.
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))
예제 #27
0
 def make_blobs(self, centers, n_samples, blocks=-1):
     """Build a seeded blob dataset locally and as an ``ArrayRDD``.

     ``X`` and ``y`` come from the module-level ``make_blobs`` generator
     with ``random_state=42``. ``X`` is parallelized and wrapped in an
     ``ArrayRDD`` with ``bsize=blocks``. Returns ``(X, y, X_rdd)``.

     NOTE(review): assumes ``-1`` matches ``ArrayRDD``'s own default block
     size, as the other dataset factories in this file suggest -- confirm.
     """
     # Inside the method body the bare name resolves to the module-level
     # generator function, not to this method.
     X, y = make_blobs(n_samples=n_samples,
                       centers=centers,
                       random_state=42)
     # Forward ``blocks``: it was previously accepted but silently ignored.
     X_rdd = ArrayRDD(self.sc.parallelize(X), bsize=blocks)
     return X, y, X_rdd
예제 #28
0
 def test_dot(self):
     """Check that ``ArrayRDD.dot`` agrees with ``ndarray.dot``.

     A (20, 10) array is distributed and multiplied by a local (10, 20)
     array; the result, passed through ``unpack``, must be almost equal to
     the local product.
     """
     left = np.arange(200).reshape(20, 10)
     right = np.arange(200).reshape(10, 20)
     distributed = ArrayRDD(self.sc.parallelize(left))
     product = unpack(distributed.dot(right))
     assert_array_almost_equal(product, left.dot(right))