Example #1
 def test_ndim(self):
     data = np.arange(4000)
     shapes = [(4000), (1000, 4), (200, 10, 2), (100, 10, 2, 2)]
     for shape in shapes:
         reshaped = data.reshape(shape)
         rdd = self.sc.parallelize(reshaped)
         assert_equal(ArrayRDD(rdd).ndim, reshaped.ndim)
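All of these snippets are test methods that assume a live SparkContext bound to self.sc plus NumPy, SciPy sparse, and the splearn (sparkit-learn) block-RDD helpers. A minimal setup sketch follows; the splearn import path and the local SparkContext configuration are assumptions, not something the examples themselves confirm.

 import numpy as np
 import scipy.sparse as sp
 from pyspark import SparkContext

 # Assumed location of the block-RDD helpers in the sparkit-learn layout
 from splearn.rdd import ArrayRDD, BlockRDD, DictRDD, block

 sc = SparkContext("local[2]", "splearn-examples")  # stands in for self.sc

 data = np.arange(4000).reshape((1000, 4))
 rdd = sc.parallelize(data, 4)
 blocked = ArrayRDD(rdd, bsize=5)  # group rows into 5-row NumPy blocks
 print(blocked.shape)              # mirrors the local array's shape, (1000, 4)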
Example #2
 def test_shape(self):
     data = np.arange(4000)
     shapes = [(1000, 4), (200, 20), (100, 40), (2000, 2)]
     for shape in shapes:
         reshaped = data.reshape(shape)
         rdd = self.sc.parallelize(reshaped)
         assert_equal(ArrayRDD(rdd).shape, shape)
Example #3
    def test_limit_features(self):
        X, X_rdd = self.make_text_rdd()

        params = [{
            'min_df': .5
        }, {
            'min_df': 2,
            'max_df': .9
        }, {
            'min_df': 1,
            'max_df': .6
        }, {
            'min_df': 2,
            'max_features': 3
        }]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X).toarray()
            result_dist = dist.fit_transform(X_rdd).toarray()

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local, result_dist)

            result_dist = dist.transform(X_rdd).toarray()
            assert_array_equal(result_local, result_dist)
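This and the other vectorizer examples below compare scikit-learn's CountVectorizer and DictVectorizer against their splearn counterparts. The scikit-learn imports are the standard ones; the splearn module paths in this sketch are assumptions based on the project's layout and may need adjusting.

 # Local (driver-side) reference implementations from scikit-learn
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_extraction import DictVectorizer

 # Assumed distributed counterparts from sparkit-learn (paths not confirmed here)
 from splearn.feature_extraction.text import SparkCountVectorizer
 from splearn.feature_extraction import SparkDictVectorizer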
Example #4
    def test_auto_dtype(self):
        x = np.arange(80).reshape((40, 2))
        y = tuple(range(40))
        z = list(range(40))
        x_rdd = self.sc.parallelize(x, 4)
        y_rdd = self.sc.parallelize(y, 4)
        z_rdd = self.sc.parallelize(z, 4)

        expected = (np.arange(20).reshape(10, 2), tuple(range(10)),
                    list(range(10)))

        rdd = DictRDD([x_rdd, y_rdd, z_rdd])
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(check_rdd_dtype(rdd, {0: np.ndarray, 1: tuple, 2: tuple}))

        rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(
            check_rdd_dtype(rdd, {
                'x': np.ndarray,
                'y': tuple,
                'z': tuple
            }))
Example #5
 def test_shape(self):
     data = np.arange(4000)
     shapes = [(1000, 4),
               (200, 20),
               (100, 40),
               (2000, 2)]
     for shape in shapes:
         rdd = self.sc.parallelize(data.reshape(shape))
         assert_equal(ArrayRDD(rdd).shape, shape)
Example #6
 def test_size(self):
     data = np.arange(4000)
     shapes = [(1000, 4), (200, 20), (100, 40), (2000, 2)]
     for shape in shapes:
         reshaped = data.reshape(shape)
         rdd = self.sc.parallelize(reshaped)
         size = ArrayRDD(rdd).map(lambda x: x.size).sum()
         assert_equal(size, reshaped.size)
         assert_equal(ArrayRDD(rdd).size, reshaped.size)
Example #7
    def test_unblock(self):
        blocked = BlockRDD(self.generate(1000, 5))
        unblocked = blocked.unblock()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked.collect(), list(range(1000)))

        blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
        unblocked = blocked.unblock()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked.collect(), list(range(1000)))
Example #8
 def test_ndim(self):
     data = np.arange(4000)
     shapes = [(4000),
               (1000, 4),
               (200, 10, 2),
               (100, 10, 2, 2)]
     for shape in shapes:
         reshaped = data.reshape(shape)
         rdd = self.sc.parallelize(reshaped)
         assert_equal(ArrayRDD(rdd).ndim, reshaped.ndim)
Example #9
    def test_same_output(self):
        X, X_rdd = self.make_dict_dataset()
        local = DictVectorizer()
        dist = SparkDictVectorizer()

        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #10
    def test_same_output(self):
        X, X_rdd = self.make_text_rdd()
        local = CountVectorizer()
        dist = SparkCountVectorizer()

        result_local = local.fit_transform(X).toarray()
        result_dist = dist.fit_transform(X_rdd).toarray()

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local, result_dist)
Example #11
    def test_unblock(self):
        blocked = BlockRDD(self.generate(1000, 5))
        unblocked = blocked.unblock()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked.collect(), list(range(1000)))

        blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
        unblocked = blocked.unblock()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked.collect(), list(range(1000)))
Example #12
    def test_same_output(self):
        X, X_rdd = self.make_text_rdd()
        local = CountVectorizer()
        dist = SparkCountVectorizer()

        result_local = local.fit_transform(X).toarray()
        result_dist = dist.fit_transform(X_rdd).toarray()

        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local, result_dist)
Example #13
 def test_blocks_number(self):
     blocked = BlockRDD(self.generate(1000), bsize=50)
     assert_equal(blocked.blocks, 20)
     blocked = BlockRDD(self.generate(621), bsize=45)
     assert_equal(blocked.blocks, 20)
     blocked = BlockRDD(self.generate(100), bsize=4)
     assert_equal(blocked.blocks, 30)
     blocked = BlockRDD(self.generate(79, 2), bsize=9)
     assert_equal(blocked.blocks, 10)
     blocked = BlockRDD(self.generate(89, 2), bsize=5)
     assert_equal(blocked.blocks, 18)
Example #14
 def test_length(self):
     blocked = BlockRDD(self.generate(1000))
     assert_equal(len(blocked), 1000)
     blocked = BlockRDD(self.generate(100))
     assert_equal(len(blocked), 100)
     blocked = BlockRDD(self.generate(79))
     assert_equal(len(blocked), 79)
     blocked = BlockRDD(self.generate(89))
     assert_equal(len(blocked), 89)
     blocked = BlockRDD(self.generate(62))
     assert_equal(len(blocked), 62)
Example #15
    def test_same_output_sparse(self):
        X, X_rdd = self.make_dict_dataset()
        local = DictVectorizer(sparse=True)
        dist = SparkDictVectorizer(sparse=True)

        result_local = local.fit_transform(X)
        result_dist = dist.fit_transform(X_rdd)

        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix, )))
        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #16
    def test_same_output_sparse(self):
        X, X_rdd = self.make_dict_dataset()
        local = DictVectorizer(sparse=True)
        dist = SparkDictVectorizer(sparse=True)

        result_local = local.fit_transform(X)
        result_dist = dist.fit_transform(X_rdd)

        assert_true(check_rdd_dtype(result_dist, (sp.spmatrix,)))
        assert_equal(local.vocabulary_, dist.vocabulary_)
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #17
 def test_length(self):
     blocked = BlockRDD(self.generate(1000))
     assert_equal(len(blocked), 1000)
     blocked = BlockRDD(self.generate(100))
     assert_equal(len(blocked), 100)
     blocked = BlockRDD(self.generate(79))
     assert_equal(len(blocked), 79)
     blocked = BlockRDD(self.generate(89))
     assert_equal(len(blocked), 89)
     blocked = BlockRDD(self.generate(62))
     assert_equal(len(blocked), 62)
Example #18
 def test_blocks_number(self):
     blocked = BlockRDD(self.generate(1000), bsize=50)
     assert_equal(blocked.blocks, 20)
     blocked = BlockRDD(self.generate(621), bsize=45)
     assert_equal(blocked.blocks, 20)
     blocked = BlockRDD(self.generate(100), bsize=4)
     assert_equal(blocked.blocks, 30)
     blocked = BlockRDD(self.generate(79, 2), bsize=9)
     assert_equal(blocked.blocks, 10)
     blocked = BlockRDD(self.generate(89, 2), bsize=5)
     assert_equal(blocked.blocks, 18)
Example #19
 def test_size(self):
     data = np.arange(4000)
     shapes = [(1000, 4),
               (200, 20),
               (100, 40),
               (2000, 2)]
     for shape in shapes:
         reshaped = data.reshape(shape)
         rdd = self.sc.parallelize(reshaped)
         size = ArrayRDD(rdd).map(lambda x: x.size).sum()
         assert_equal(size, reshaped.size)
         assert_equal(ArrayRDD(rdd).size, reshaped.size)
Example #20
    def test_sum(self):
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data)
        assert_equal(ArrayRDD(rdd).sum(), data.sum())
        assert_array_equal(ArrayRDD(rdd).sum(axis=0), data.sum(axis=0))
        assert_array_equal(ArrayRDD(rdd).sum(axis=1), data.sum(axis=1))

        data = np.arange(600).reshape((100, 3, 2))
        rdd = self.sc.parallelize(data)
        assert_equal(ArrayRDD(rdd).sum(), data.sum())
        assert_array_equal(ArrayRDD(rdd).sum(axis=0), data.sum(axis=0))
        assert_array_equal(ArrayRDD(rdd).sum(axis=1), data.sum(axis=1))
        assert_array_equal(ArrayRDD(rdd).sum(axis=2), data.sum(axis=2))
Example #21
    def test_convert_tolist(self):
        data = np.arange(400)
        rdd = self.sc.parallelize(data, 4)
        X = ArrayRDD(rdd, 5)
        X_list = X.tolist()
        assert_is_instance(X_list, list)
        assert_equal(X_list, data.tolist())

        data = [2, 3, 5, 1, 6, 7, 9, 9]
        rdd = self.sc.parallelize(data, 2)
        X = ArrayRDD(rdd)
        X_list = X.tolist()
        assert_is_instance(X_list, list)
        assert_equal(X_list, data)
Example #22
    def test_convert_tolist(self):
        data = np.arange(400)
        rdd = self.sc.parallelize(data, 4)
        X = ArrayRDD(rdd, 5)
        X_list = X.tolist()
        assert_is_instance(X_list, list)
        assert_equal(X_list, data.tolist())

        data = [2, 3, 5, 1, 6, 7, 9, 9]
        rdd = self.sc.parallelize(data, 2)
        X = ArrayRDD(rdd)
        X_list = X.tolist()
        assert_is_instance(X_list, list)
        assert_equal(X_list, data)
Example #23
    def test_transform_with_dtype(self):
        data1 = np.arange(400).reshape((100, 4))
        data2 = np.arange(200).reshape((100, 2))
        rdd1 = self.sc.parallelize(data1, 4)
        rdd2 = self.sc.parallelize(data2, 4)

        X = DictRDD(rdd1.zip(rdd2), bsize=5)

        X2 = X.transform(lambda x: x ** 2, column=0)
        assert_equal(X2.dtype, (np.ndarray, np.ndarray))

        X2 = X.transform(lambda x: tuple((x ** 2).tolist()), column=0,
                         dtype=tuple)
        assert_equal(X2.dtype, (tuple, np.ndarray))
        assert_true(check_rdd_dtype(X2, {0: tuple, 1: np.ndarray}))

        X2 = X.transform(lambda x: x ** 2, column=1, dtype=list)
        assert_equal(X2.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))

        X2 = X.transform(lambda a, b: (a ** 2, (b ** 0.5).tolist()),
                         column=[0, 1], dtype=(np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))

        X2 = X.transform(lambda b, a: ((b ** 0.5).tolist(), a ** 2),
                         column=[1, 0], dtype=(list, np.ndarray))
        assert_equal(X2.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))
Example #24
    def test_transform_with_dtype(self):
        data1 = np.arange(400).reshape((100, 4))
        data2 = np.arange(200).reshape((100, 2))
        rdd1 = self.sc.parallelize(data1, 4)
        rdd2 = self.sc.parallelize(data2, 4)

        X = DictRDD(rdd1.zip(rdd2), bsize=5)

        X2 = X.transform(lambda x: x**2, column=0)
        assert_equal(X2.dtype, (np.ndarray, np.ndarray))

        X2 = X.transform(lambda x: tuple((x**2).tolist()),
                         column=0,
                         dtype=tuple)
        assert_equal(X2.dtype, (tuple, np.ndarray))
        assert_true(check_rdd_dtype(X2, {0: tuple, 1: np.ndarray}))

        X2 = X.transform(lambda x: x**2, column=1, dtype=list)
        assert_equal(X2.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))

        X2 = X.transform(lambda a, b: (a**2, (b**0.5).tolist()),
                         column=[0, 1],
                         dtype=(np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))

        X2 = X.transform(lambda b, a: ((b**0.5).tolist(), a**2),
                         column=[1, 0],
                         dtype=(list, np.ndarray))
        assert_equal(X2.dtype, (np.ndarray, list))
        assert_true(check_rdd_dtype(X2, {0: np.ndarray, 1: list}))
Example #25
    def test_creation(self):
        rdd = self.generate()

        blocked = BlockRDD(rdd)
        assert_is_instance(blocked, BlockRDD)
        assert_equal(blocked.first(), tuple(range(10)))
        assert_equal(blocked.collect(),
                     [tuple(v) for v in np.arange(100).reshape(10, 10)])

        blocked = BlockRDD(rdd, bsize=4)
        assert_is_instance(blocked, BlockRDD)
        assert_equal(blocked.first(), tuple(range(4)))
        assert_equal([len(x) for x in blocked.collect()], [4, 4, 2] * 10)
Example #26
    def test_limit_features(self):
        X, X_rdd = self.make_text_rdd()

        params = [{'min_df': .5},
                  {'min_df': 2, 'max_df': .9},
                  {'min_df': 1, 'max_df': .6},
                  {'min_df': 2, 'max_features': 3}]

        for paramset in params:
            local = CountVectorizer(**paramset)
            dist = SparkCountVectorizer(**paramset)

            result_local = local.fit_transform(X)
            result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())

            assert_equal(local.vocabulary_, dist.vocabulary_)
            assert_array_equal(result_local.toarray(), result_dist.toarray())

            result_dist = sp.vstack(dist.transform(X_rdd).collect())
            assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #27
    def test_auto_dtype(self):
        x = np.arange(80).reshape((40, 2))
        y = tuple(range(40))
        z = list(range(40))
        x_rdd = self.sc.parallelize(x, 4)
        y_rdd = self.sc.parallelize(y, 4)
        z_rdd = self.sc.parallelize(z, 4)

        expected = (np.arange(20).reshape(10, 2), tuple(range(10)),
                    list(range(10)))

        rdd = DictRDD([x_rdd, y_rdd, z_rdd])
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(check_rdd_dtype(rdd, {0: np.ndarray, 1: tuple, 2: tuple}))

        rdd = DictRDD([x_rdd, y_rdd, z_rdd], columns=('x', 'y', 'z'))
        assert_tuple_equal(rdd.first(), expected)
        assert_equal(rdd.dtype, (np.ndarray, tuple, tuple))
        assert_true(check_rdd_dtype(rdd, {'x': np.ndarray, 'y': tuple,
                                          'z': tuple}))
Example #28
 def test_partition_number(self):
     blocked = BlockRDD(self.generate(1000, 5), bsize=50)
     assert_equal(blocked.partitions, 5)
     blocked = BlockRDD(self.generate(621, 3), bsize=45)
     assert_equal(blocked.partitions, 3)
     blocked = BlockRDD(self.generate(100, 10))
     assert_equal(blocked.partitions, 10)
Example #29
 def test_partition_number(self):
     blocked = BlockRDD(self.generate(1000, 5), bsize=50)
     assert_equal(blocked.partitions, 5)
     blocked = BlockRDD(self.generate(621, 3), bsize=45)
     assert_equal(blocked.partitions, 3)
     blocked = BlockRDD(self.generate(100, 10))
     assert_equal(blocked.partitions, 10)
Example #30
    def test_creation(self):
        rdd = self.generate()

        blocked = BlockRDD(rdd)
        assert_is_instance(blocked, BlockRDD)
        expected = tuple(range(10))
        assert_equal(blocked.first(), expected)
        expected = [tuple(v) for v in np.arange(100).reshape(10, 10)]
        assert_equal(blocked.collect(), expected)

        blocked = BlockRDD(rdd, bsize=4)
        assert_is_instance(blocked, BlockRDD)
        expected = tuple(range(4))
        assert_equal(blocked.first(), expected)
        expected = [4, 4, 2] * 10
        assert_equal([len(x) for x in blocked.collect()], expected)
Example #31
    def test_creation(self):
        rdd = self.generate()

        blocked = BlockRDD(rdd)
        assert_is_instance(blocked, BlockRDD)
        expected = tuple(range(10))
        assert_equal(blocked.first(), expected)
        expected = [tuple(v) for v in np.arange(100).reshape(10, 10)]
        assert_equal(blocked.collect(), expected)

        blocked = BlockRDD(rdd, bsize=4)
        assert_is_instance(blocked, BlockRDD)
        expected = tuple(range(4))
        assert_equal(blocked.first(), expected)
        expected = [4, 4, 2] * 10
        assert_equal([len(x) for x in blocked.collect()], expected)
Example #32
    def test_dtype(self):
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize(["lorem" for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data, dtype=list)
        assert_array_equal(["lorem"] * 10, blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(["lorem"] * 10, blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([1 for i in range(n_samples)], n_partitions)
        blocked_data = block(data, dtype=tuple)
        assert_array_equal(tuple([1] * (n_samples // n_partitions)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Example #33
    def test_array(self):
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        assert_array_equal(np.ones((10, 1)), blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(np.ones((10, 1)), blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        assert_array_equal(np.ones((n_samples // n_partitions, 1)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Example #34
    def test_dtype(self):
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize(["lorem" for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data, dtype=list)
        assert_array_equal(["lorem"] * 10, blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(["lorem"] * 10, blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([1 for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data, dtype=tuple)
        # 100 // 17 == 5 full elements per partition, so the first block is a 5-tuple
        assert_array_equal(tuple([1] * (n_samples // n_partitions)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Example #35
    def test_array(self):
        n_partitions = 10
        n_samples = 100
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        assert_array_equal(np.ones((10, 1)), blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_array_equal(np.ones((10, 1)), blocks[-1])
        assert_equal(sum(len(b) for b in blocks), n_samples)

        n_partitions = 17
        data = self.sc.parallelize([np.array([1]) for i in range(n_samples)],
                                   n_partitions)
        blocked_data = block(data)
        assert_array_equal(np.ones((n_samples // n_partitions, 1)),
                           blocked_data.first())
        blocks = blocked_data.collect()
        assert_equal(len(blocks), n_partitions)
        assert_equal(sum(len(b) for b in blocks), n_samples)
Example #36
    def test_tolist(self):
        blocked = BlockRDD(self.generate(1000, 5))
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, list(range(1000)))

        blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, list(range(1000)))

        blocked = BlockRDD(self.generate(1000, 5), dtype=np.array)
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, list(range(1000)))
Example #37
    def test_tolist(self):
        blocked = BlockRDD(self.generate(1000, 5))
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, list(range(1000)))

        blocked = BlockRDD(self.generate(1000, 5), dtype=tuple)
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, list(range(1000)))

        blocked = BlockRDD(self.generate(1000, 5), dtype=np.array)
        unblocked = blocked.tolist()
        assert_is_instance(blocked, BlockRDD)
        assert_equal(unblocked, list(range(1000)))
Example #38
    def test_block_rdd_tuple(self):
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        data = self.sc.parallelize([(np.array([1., 2.]), 0, sparse_row)
                                    for i in range(n_samples)], n_partitions)
        blocked_data = block(data)

        expected_first_block = np.array([[1., 2.]] * 10)
        expected_second_block = np.zeros(10, dtype=int)  # np.int was removed in NumPy 1.24
        expected_third_block = sp.vstack([sparse_row] * 10)

        first_block_tuple = blocked_data.first()
        assert_array_almost_equal(expected_first_block, first_block_tuple[0])
        assert_array_almost_equal(expected_second_block, first_block_tuple[1])
        assert_array_almost_equal(expected_third_block.toarray(),
                                  first_block_tuple[2].toarray())

        tuple_blocks = blocked_data.collect()
        assert_equal(len(tuple_blocks), n_partitions)
        assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
        assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
Example #39
    def test_block_rdd_tuple(self):
        n_partitions = 10
        n_samples = 100
        sparse_row = sp.csr_matrix([[0, 0, 1, 0, 1]])
        data = self.sc.parallelize(
            [(np.array([1., 2.]), 0, sparse_row) for i in range(n_samples)],
            n_partitions)
        blocked_data = block(data)

        expected_first_block = np.array([[1., 2.]] * 10)
        expected_second_block = np.zeros(10, dtype=int)  # np.int was removed in NumPy 1.24
        expected_third_block = sp.vstack([sparse_row] * 10)

        first_block_tuple = blocked_data.first()
        assert_array_almost_equal(expected_first_block, first_block_tuple[0])
        assert_array_almost_equal(expected_second_block, first_block_tuple[1])
        assert_array_almost_equal(expected_third_block.toarray(),
                                  first_block_tuple[2].toarray())

        tuple_blocks = blocked_data.collect()
        assert_equal(len(tuple_blocks), n_partitions)
        assert_equal(sum(len(b[0]) for b in tuple_blocks), n_samples)
        assert_equal(sum(len(b[1]) for b in tuple_blocks), n_samples)
Example #40
    def test_blocks_number(self):
        n_partitions = 10
        n_samples = 1000

        data = [np.array([1, 2]) for i in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        assert_equal(1000, ArrayRDD(rdd, noblock=True, bsize=1).blocks)
        assert_equal(10, ArrayRDD(rdd).blocks)
        assert_equal(20, ArrayRDD(rdd, 50).blocks)
        assert_equal(20, ArrayRDD(rdd, 66).blocks)
        assert_equal(10, ArrayRDD(rdd, 100).blocks)
        assert_equal(10, ArrayRDD(rdd, 300).blocks)
        assert_equal(200, ArrayRDD(rdd, 5).blocks)
        assert_equal(100, ArrayRDD(rdd, 10).blocks)
Example #41
 def test_mean(self):
     data = np.arange(600).reshape((100, 3, 2))
     rdd = self.sc.parallelize(data)
     assert_equal(ArrayRDD(rdd).mean(), data.mean())
     assert_array_equal(ArrayRDD(rdd).mean(axis=0), data.mean(axis=0))
     assert_array_equal(ArrayRDD(rdd).mean(axis=1), data.mean(axis=1))
Example #42
    def test_partitions_number(self):
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, 4)
        assert_equal(ArrayRDD(rdd, 5).partitions, 4)
        assert_equal(ArrayRDD(rdd, 10).partitions, 4)
        assert_equal(ArrayRDD(rdd, 20).partitions, 4)

        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, 7)
        assert_equal(ArrayRDD(rdd, 5).partitions, 7)
        assert_equal(ArrayRDD(rdd, 10).partitions, 7)
        assert_equal(ArrayRDD(rdd, 20).partitions, 7)
Example #43
    def test_blocks_number(self):
        n_partitions = 10
        n_samples = 1000

        data = [np.array([1, 2]) for i in range(n_samples)]
        rdd = self.sc.parallelize(data, n_partitions)

        assert_equal(1000, ArrayRDD(rdd, noblock=True, bsize=1).blocks)
        assert_equal(10, ArrayRDD(rdd).blocks)
        assert_equal(20, ArrayRDD(rdd, 50).blocks)
        assert_equal(20, ArrayRDD(rdd, 66).blocks)
        assert_equal(10, ArrayRDD(rdd, 100).blocks)
        assert_equal(10, ArrayRDD(rdd, 300).blocks)
        assert_equal(200, ArrayRDD(rdd, 5).blocks)
        assert_equal(100, ArrayRDD(rdd, 10).blocks)
Example #44
    def test_partitions_number(self):
        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, 4)
        assert_equal(ArrayRDD(rdd, 5).partitions, 4)
        assert_equal(ArrayRDD(rdd, 10).partitions, 4)
        assert_equal(ArrayRDD(rdd, 20).partitions, 4)

        data = np.arange(400).reshape((100, 4))
        rdd = self.sc.parallelize(data, 7)
        assert_equal(ArrayRDD(rdd, 5).partitions, 7)
        assert_equal(ArrayRDD(rdd, 10).partitions, 7)
        assert_equal(ArrayRDD(rdd, 20).partitions, 7)