Example No. 1
 def test_set_if_missing(self):
     conf = SparkConf()
     conf.set(self.RANDOM_KEY, self.RANDOM_VALUE)
     conf.setIfMissing(self.RANDOM_KEY, self.RANDOM_VALUE2)
     self.assertEquals(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
     conf.setIfMissing(self.RANDOM_KEY2, self.RANDOM_VALUE2)
     self.assertEquals(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
Example No. 2
 def test_contains(self):
     conf = SparkConf()
     conf.setAll(
         pairs=[(self.RANDOM_KEY, self.RANDOM_VALUE),
                (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
     )
     self.assertTrue(conf.contains(self.RANDOM_KEY))
     self.assertTrue(conf.contains(self.RANDOM_KEY2))
Example No. 3
 def test_set_all(self):
     conf = SparkConf()
     conf.setAll(
         pairs=[(self.RANDOM_KEY, self.RANDOM_VALUE),
                (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
     )
     self.assertEquals(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
     self.assertEquals(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
Example No. 4
 def test_set_executor_env2(self):
     conf = SparkConf()
     conf.setExecutorEnv(
         key=self.RANDOM_KEY,
         value=self.RANDOM_VALUE,
         pairs=[(self.RANDOM_KEY2, self.RANDOM_VALUE2)]
     )
     self.assertEquals(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
     self.assertEquals(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
Example No. 5
 def test_set_if_missing(self):
     conf = SparkConf()
     conf.set(self.RANDOM_KEY, self.RANDOM_VALUE)
     conf.setIfMissing(self.RANDOM_KEY, self.RANDOM_VALUE2)
     self.assertEquals(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
     conf.setIfMissing(self.RANDOM_KEY2, self.RANDOM_VALUE2)
     self.assertEquals(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
Example No. 6
    def test_named_properties(self):
        conf = SparkConf()

        conf.setMaster(self.RANDOM_VALUE)
        self.assertEquals(conf.get('master'), self.RANDOM_VALUE)

        conf.setAppName(self.RANDOM_VALUE)
        self.assertEquals(conf.get('appName'), self.RANDOM_VALUE)

        conf.setSparkHome(self.RANDOM_VALUE)
        self.assertEquals(conf.get('sparkHome'), self.RANDOM_VALUE)
Example No. 7
def spark_ctx():
    """A simple spark context."""

    if IF_DUMMY_SPARK:
        from dummy_spark import SparkConf, SparkContext
        conf = SparkConf()
        ctx = SparkContext(master='', conf=conf)
    else:
        from pyspark import SparkConf, SparkContext
        conf = SparkConf().setMaster('local[2]').setAppName('drudge-unittest')
        ctx = SparkContext(conf=conf)

    return ctx
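
A minimal sketch of how this helper might be wired into a test run, assuming pytest is available, that IF_DUMMY_SPARK is a flag defined elsewhere in the module, and that the fixture and test names here are purely illustrative:

import pytest

@pytest.fixture(scope='module')
def ctx():
    # reuse the helper above; under dummy_spark this is a lightweight in-process context
    return spark_ctx()

def test_squares(ctx):
    rdd = ctx.parallelize([1, 2, 3]).map(lambda x: x * x)
    assert rdd.collect() == [1, 4, 9]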
Example No. 8
    def test_word_count_3(self):

        lines = [
            'apple',
            'apple banana',
            'apple banana',
            'apple banana grape',
            'banana grape',
            'banana'
        ]

        expected_output = [
            ('apple', 4),
            ('banana', 5),
            ('grape', 2),
        ]

        sc = SparkContext(master='', conf=SparkConf())

        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)

        output = rdd.collect()
        self.assertEquals(sorted(output), sorted(expected_output))
Example No. 9
 def test_left_outer_join(self):
     sc = SparkContext(master='', conf=SparkConf())
     rdd1 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4])])
     rdd2 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4]), ('B', [4, 5, 6])])
     out = rdd1.leftOuterJoin(rdd2).collect()
     print(out)
     self.assertEqual(len(out), 2)
Example No. 10
    def test_combineByKey(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([
            ('A', 1),
            ('B', 2),
            ('B', 3),
            ('C', 4),
            ('C', 5),
            ('A', 6),
        ])

        def create_combiner(a):
            return [a]

        def merge_value(a, b):
            a.append(b)
            return a

        def merge_combiners(a, b):
            a.extend(b)
            return a

        rdd = rdd.combineByKey(create_combiner, merge_value, merge_combiners)
        self.assertListEqual(
            rdd.collect(),
            [('A', [1, 6]), ('B', [2, 3]), ('C', [4, 5])],
        )
Example No. 11
def test_minion_perform_deliver_success():
    workflow_id = '6666'
    app_id = '1000'
    job_id = '1'
    out_queue = 'queue_2000'
    sconf = SparkConf()
    sc = SparkContext(master='', conf=sconf)

    rdd = sc.parallelize(get_records())

    df0 = DataFrame(rdd=rdd)
    with mock.patch('redis.StrictRedis',
                    mock_strict_redis_client) as mocked_redis:
        redis_conn = mocked_redis()
        state_control = StateControlRedis(redis_conn)

        data = {
            'workflow_id': workflow_id,
            'app_id': app_id,
            'job_id': job_id,
            'type': 'deliver',
            'task_id': '033f-284ab-28987e',
            'port': 'port0',
            'output': out_queue,
            'workflow': ''
        }
        state_control.push_app_queue(app_id, json.dumps(data))
        minion = SparkMinion(redis_conn=redis_conn,
                             workflow_id=workflow_id,
                             app_id=app_id,
                             config=config)
        minion._emit_event = dummy_emit_event
        minion._state = {
            data['task_id']: {
                'port0': {
                    'output': df0,
                    'sample': []
                },
                'time': 35.92
            }
        }
        minion._process_message()

        # Discard first status message
        state_control.pop_app_output_queue(app_id, False)

        msg = json.loads(state_control.pop_app_output_queue(app_id, False))
        assert msg['status'] == 'SUCCESS', 'Invalid status'
        assert msg['code'] == minion.MNN002[0], 'Invalid code'

        # CSV data
        csv_records = '\n'.join(
            map(dataframe_util.convert_to_csv, get_records()))

        result = json.loads(state_control.pop_queue(out_queue, False))
        assert result['sample'] == csv_records, 'Wrong CSV generated'
Example No. 12
 def test_set_executor_env2(self):
     conf = SparkConf()
     conf.setExecutorEnv(key=self.RANDOM_KEY,
                         value=self.RANDOM_VALUE,
                         pairs=[(self.RANDOM_KEY2, self.RANDOM_VALUE2)])
     self.assertEquals(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
     self.assertEquals(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
Example No. 13
 def test_set_all(self):
     conf = SparkConf()
     conf.setAll(
         pairs=[(self.RANDOM_KEY, self.RANDOM_VALUE),
                (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
     )
     self.assertEquals(conf.get(self.RANDOM_KEY), self.RANDOM_VALUE)
     self.assertEquals(conf.get(self.RANDOM_KEY2), self.RANDOM_VALUE2)
Example No. 14
 def test_contains(self):
     conf = SparkConf()
     conf.setAll(
         pairs=[(self.RANDOM_KEY, self.RANDOM_VALUE),
                (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
     )
     self.assertTrue(conf.contains(self.RANDOM_KEY))
     self.assertTrue(conf.contains(self.RANDOM_KEY2))
Example No. 15
 def test_sortByKey_descending(self):
     sc = SparkContext(master='', conf=SparkConf())
     rdd = (sc.parallelize([
         ('a', 1),
         ('b', 2),
         ('c', 3),
         ('d', 4),
         ('e', 5),
     ]).sortByKey(ascending=False))
     self.assertListEqual(
         rdd.collect(),
         [
             ('e', 5),
             ('d', 4),
             ('c', 3),
             ('b', 2),
             ('a', 1),
         ],
     )
Example No. 16
# -*- coding: utf-8 -*-

import os
import random

from dummy_spark import SparkContext, SparkConf
from dummy_spark.sql import SQLContext

__author__ = 'willmcginnis'

# make a spark conf
sconf = SparkConf()

# set some property (won't do anything)
sconf.set('spark.executor.extraClassPath', 'foo')

# use the spark conf to make a spark context
sc = SparkContext(master='', conf=sconf)

# set the log level (also doesn't do anything)
sc.setLogLevel('INFO')

# maybe make a useless sqlcontext (nothing implemented here yet)
sqlctx = SQLContext(sc)

# add pyfile just appends to the sys path
sc.addPyFile(os.path.dirname(__file__))

# do some hadoop configuration into the ether
sc._jsc.hadoopConfiguration().set('foo', 'bar')
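
Since the dummy context mirrors the core RDD calls exercised by the tests below (parallelize, flatMap, map, reduceByKey, collect), a short pipeline built on the sc created above behaves the same way. A minimal sketch with illustrative data:

# tiny word count on the dummy context built in the script above
rdd = sc.parallelize(['spam eggs', 'spam'])
counts = (rdd.flatMap(lambda line: line.split(' '))
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b))
print(sorted(counts.collect()))  # [('eggs', 1), ('spam', 2)]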
Example No. 17
class RDDTests(unittest.TestCase):

    SPARK_CONTEXT = SparkContext(master='', conf=SparkConf())
    TEST_RANGES = [
        (0, 0, 1),
        (0, 10, 1),
        (0, 10, 2),
        (0, 100, 13),
        (0, 1000, 17),
        (0, 10000, 31),
    ]
    SAMPLE_FRACTION = 0.10
    SAMPLE_SEED = 1234

    def test_init(self):
        for start, stop, step in self.TEST_RANGES:
            l = list(range(start, stop, step))
            rdd = RDD(l, self.SPARK_CONTEXT)
            self.assertEquals(l, rdd.collect())

            s = set(range(100))
            rdd = RDD(s, self.SPARK_CONTEXT)
            self.assertEquals(sorted(list(s)), sorted(rdd.collect()))

        t = (1, 2, 3)
        with self.assertRaises(AttributeError):
            RDD(t, self.SPARK_CONTEXT)

        with self.assertRaises(AttributeError):
            RDD('', self.SPARK_CONTEXT)

    def test_ctx(self):
        rdd = RDD([], self.SPARK_CONTEXT)
        self.assertEquals(rdd.ctx, self.SPARK_CONTEXT)

    @staticmethod
    def square(x):
        return x**2

    def test_map(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = map(RDDTests.square, l1)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.map(RDDTests.square)
            self.assertEquals(rdd.collect(), list(l2))

    @staticmethod
    def triplicate(x):
        return [x, x, x]

    def test_flat_map(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = map(RDDTests.triplicate, l1)
            l3 = []
            for sl in l2:
                l3.extend(sl)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.flatMap(RDDTests.triplicate)
            self.assertEquals(rdd.collect(), list(l3))

    @staticmethod
    def is_square(x):
        return x == x**2

    def test_filter(self):
        for start, stop, step in self.TEST_RANGES:
            l1 = range(start, stop, step)
            l2 = filter(RDDTests.is_square, l1)
            rdd = RDD(list(l1), self.SPARK_CONTEXT)
            rdd = rdd.filter(RDDTests.is_square)
            self.assertEquals(rdd.collect(), list(l2))

    @staticmethod
    def return_one(x):
        return x - x + 1

    def test_distinct(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            rdd = rdd.map(RDDTests.return_one)
            rdd = rdd.distinct()
            if len(l) > 0:
                self.assertEquals(rdd.collect(), [1])
            else:
                self.assertEquals(rdd.collect(), [])

    def test_sample_with_replacement(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample = rdd.sample(True, self.SAMPLE_FRACTION).collect()
            self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_with_replacement_with_seed(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample1 = rdd.sample(True, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            sample2 = rdd.sample(True, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            self.assertEquals(sorted(sample1), sorted(sample2))
            sample = sample1
            self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_without_replacement(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample = rdd.sample(False, self.SAMPLE_FRACTION).collect()
            self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            self.assertEquals(sorted(l), sorted(set(l)))
            for item in sample:
                self.assertTrue(item in l)

    def test_sample_without_replacement_with_seed(self):
        for start, stop, step in self.TEST_RANGES:
            l = range(start, stop, step)
            rdd = RDD(list(l), self.SPARK_CONTEXT)
            sample1 = rdd.sample(False, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            sample2 = rdd.sample(False, self.SAMPLE_FRACTION,
                                 self.SAMPLE_SEED).collect()
            self.assertEquals(sorted(sample1), sorted(sample2))
            sample = sample1
            self.assertEquals(len(sample), int(len(l) * self.SAMPLE_FRACTION))
            self.assertEquals(sorted(l), sorted(set(l)))
            for item in sample:
                self.assertTrue(item in l)

    def test_union(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.union(rdd2)
                self.assertEquals(sorted(rdd.collect()),
                                  sorted(list(l1) + list(l2)))

    def test_intersection(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.intersection(rdd2)
                self.assertEquals(sorted(rdd.collect()),
                                  sorted([x for x in l1 if x in l2]))

    def test_group_by_key(self):
        l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        rdd = RDD(l, self.SPARK_CONTEXT)
        rdd = rdd.groupByKey()
        r = rdd.collect()
        r = [(kv[0], list(kv[1])) for kv in r]
        self.assertEquals(sorted(r),
                          sorted([(1, [1]), (2, [1, 2]), (3, [1, 2, 3])]))

    def test_reduce_by_key(self):
        l = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        rdd = RDD(l, self.SPARK_CONTEXT)
        rdd = rdd.reduceByKey(lambda a, b: a + b)

        print(rdd)

        self.assertEquals(sorted(rdd.collect()),
                          sorted([(1, 1), (2, 3), (3, 6)]))

    def test_cartesian(self):
        for start1, stop1, step1 in self.TEST_RANGES:
            for start2, stop2, step2 in self.TEST_RANGES:
                l1 = range(start1, stop1, step1)
                l2 = range(start2, stop2, step2)
                rdd1 = RDD(list(l1), self.SPARK_CONTEXT)
                rdd2 = RDD(list(l2), self.SPARK_CONTEXT)
                rdd = rdd1.cartesian(rdd2)
                r = rdd.collect()
                self.assertEquals(len(r), len(l1) * len(l2))
                for t, u in r:
                    self.assertTrue(t in l1)
                    self.assertTrue(u in l2)

    def test_cogroup(self):
        l1 = [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2), (3, 3)]
        l2 = [(2, 10), (2, 20), (3, 10), (3, 20), (3, 30), (4, 40)]
        rdd1 = RDD(l1, self.SPARK_CONTEXT)
        rdd2 = RDD(l2, self.SPARK_CONTEXT)
        rdd = rdd1.cogroup(rdd2)
        l = rdd.collect()
        self.assertEquals(
            sorted(l),
            sorted([(1, [1], []), (2, [1, 2], [10, 20]),
                    (3, [1, 2, 3], [10, 20, 30]), (4, [], [40])]))

    def test_word_count_1(self):

        lines = [
            'grape banana apple',
        ]

        expected_output = [
            ('apple', 1),
            ('banana', 1),
            ('grape', 1),
        ]

        sc = SparkContext(master='', conf=SparkConf())

        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)

        output = rdd.collect()
        self.assertEquals(sorted(output), sorted(expected_output))

    def test_word_count_2(self):

        lines = [
            'apple',
            'apple banana',
            'apple banana',
            'apple banana grape',
        ]

        expected_output = [
            ('apple', 4),
            ('banana', 3),
            ('grape', 1),
        ]

        sc = SparkContext(master='', conf=SparkConf())

        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)

        output = rdd.collect()
        self.assertEquals(sorted(output), sorted(expected_output))

    def test_word_count_3(self):

        lines = [
            'apple', 'apple banana', 'apple banana', 'apple banana grape',
            'banana grape', 'banana'
        ]

        expected_output = [
            ('apple', 4),
            ('banana', 5),
            ('grape', 2),
        ]

        sc = SparkContext(master='', conf=SparkConf())

        rdd = sc.parallelize(lines)
        rdd = rdd.flatMap(lambda x: x.split(' '))
        rdd = rdd.map(lambda word: (word, 1))
        rdd = rdd.reduceByKey(lambda a, b: a + b)

        output = rdd.collect()
        self.assertEquals(sorted(output), sorted(expected_output))

    def test_left_outer_join(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd1 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4])])
        rdd2 = sc.parallelize([('A', [1, 2, 3]), ('B', [2, 3, 4]),
                               ('B', [4, 5, 6])])
        out = rdd1.leftOuterJoin(rdd2).collect()
        print(out)
        self.assertEqual(len(out), 2)

    def test_keys(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        self.assertListEqual(rdd.keys().collect(), ['A', 'B', 'C'])

    def test_values(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        self.assertListEqual(rdd.values().collect(), [1, 2, 3])

    def test_combineByKey(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([
            ('A', 1),
            ('B', 2),
            ('B', 3),
            ('C', 4),
            ('C', 5),
            ('A', 6),
        ])

        def create_combiner(a):
            return [a]

        def merge_value(a, b):
            a.append(b)
            return a

        def merge_combiners(a, b):
            a.extend(b)
            return a

        rdd = rdd.combineByKey(create_combiner, merge_value, merge_combiners)
        self.assertListEqual(
            rdd.collect(),
            [('A', [1, 6]), ('B', [2, 3]), ('C', [4, 5])],
        )

    def test_sortByKey_ascending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([
            ('e', 5),
            ('d', 4),
            ('c', 3),
            ('b', 2),
            ('a', 1),
        ]).sortByKey(ascending=True))
        self.assertListEqual(
            rdd.collect(),
            [
                ('a', 1),
                ('b', 2),
                ('c', 3),
                ('d', 4),
                ('e', 5),
            ],
        )

    def test_sortByKey_descending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([
            ('a', 1),
            ('b', 2),
            ('c', 3),
            ('d', 4),
            ('e', 5),
        ]).sortByKey(ascending=False))
        self.assertListEqual(
            rdd.collect(),
            [
                ('e', 5),
                ('d', 4),
                ('c', 3),
                ('b', 2),
                ('a', 1),
            ],
        )

    def test_sortBy_ascending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([5, 4, 3, 2, 1]).sortBy(lambda x: x,
                                                      ascending=True))
        self.assertListEqual(rdd.collect(), [1, 2, 3, 4, 5])

    def test_sortBy_descending(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = (sc.parallelize([1, 2, 3, 4, 5]).sortBy(lambda x: x,
                                                      ascending=False))
        self.assertListEqual(rdd.collect(), [5, 4, 3, 2, 1])

    def test_subtractByKey(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd1 = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
        rdd2 = sc.parallelize([('A', None), ('C', None)])
        self.assertListEqual(rdd1.subtractByKey(rdd2).collect(), [('B', 2)])

    def test_not_implemented_methods(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([])

        with self.assertRaises(NotImplementedError):
            rdd._pickled()

        with self.assertRaises(NotImplementedError):
            rdd.mapPartitionsWithIndex(
                None,
                None,
            )

        with self.assertRaises(NotImplementedError):
            rdd._computeFractionForSampleSize(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.pipe(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.reduce(None)

        with self.assertRaises(NotImplementedError):
            rdd.treeReduce(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.fold(
                None,
                None,
            )

        with self.assertRaises(NotImplementedError):
            rdd.aggregate(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.treeAggregate(None, None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.stats()

        with self.assertRaises(NotImplementedError):
            rdd.histogram(None)

        with self.assertRaises(NotImplementedError):
            rdd.variance()

        with self.assertRaises(NotImplementedError):
            rdd.stdev()

        with self.assertRaises(NotImplementedError):
            rdd.sampleStdev()

        with self.assertRaises(NotImplementedError):
            rdd.sampleVariance()

        with self.assertRaises(NotImplementedError):
            rdd.countByValue()

        with self.assertRaises(NotImplementedError):
            rdd.top(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.takeOrdered(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsNewAPIHadoopDataset(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsNewAPIHadoopFile(None, None, None, None, None, None,
                                       None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsHadoopDataset(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsHadoopFile(None, None, None, None, None, None, None,
                                 None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsSequenceFile(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsPickleFile(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsTextFile(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.collectAsMap()

        with self.assertRaises(NotImplementedError):
            rdd.reduceByKeyLocally(None)

        with self.assertRaises(NotImplementedError):
            rdd.countByKey()

        with self.assertRaises(NotImplementedError):
            rdd.join(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.rightOuterJoin(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.fullOuterJoin(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.foldByKey(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd._can_spill()

        with self.assertRaises(NotImplementedError):
            rdd._memory_limit()

        with self.assertRaises(NotImplementedError):
            rdd.groupWith(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.sampleByKey(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.subtract(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.coalesce(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.toDebugString()

        with self.assertRaises(NotImplementedError):
            rdd.getStorageLevel()

        with self.assertRaises(NotImplementedError):
            rdd._to_java_object_rdd()
Example No. 18
    def test_not_implemented_methods(self):
        sc = SparkContext(master='', conf=SparkConf())
        rdd = sc.parallelize([])

        with self.assertRaises(NotImplementedError):
            rdd._pickled()

        with self.assertRaises(NotImplementedError):
            rdd.mapPartitionsWithIndex(
                None,
                None,
            )

        with self.assertRaises(NotImplementedError):
            rdd._computeFractionForSampleSize(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.pipe(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.reduce(None)

        with self.assertRaises(NotImplementedError):
            rdd.treeReduce(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.fold(
                None,
                None,
            )

        with self.assertRaises(NotImplementedError):
            rdd.aggregate(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.treeAggregate(None, None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.stats()

        with self.assertRaises(NotImplementedError):
            rdd.histogram(None)

        with self.assertRaises(NotImplementedError):
            rdd.variance()

        with self.assertRaises(NotImplementedError):
            rdd.stdev()

        with self.assertRaises(NotImplementedError):
            rdd.sampleStdev()

        with self.assertRaises(NotImplementedError):
            rdd.sampleVariance()

        with self.assertRaises(NotImplementedError):
            rdd.countByValue()

        with self.assertRaises(NotImplementedError):
            rdd.top(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.takeOrdered(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsNewAPIHadoopDataset(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsNewAPIHadoopFile(None, None, None, None, None, None,
                                       None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsHadoopDataset(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsHadoopFile(None, None, None, None, None, None, None,
                                 None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsSequenceFile(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsPickleFile(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.saveAsTextFile(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.collectAsMap()

        with self.assertRaises(NotImplementedError):
            rdd.reduceByKeyLocally(None)

        with self.assertRaises(NotImplementedError):
            rdd.countByKey()

        with self.assertRaises(NotImplementedError):
            rdd.join(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.rightOuterJoin(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.fullOuterJoin(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.foldByKey(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd._can_spill()

        with self.assertRaises(NotImplementedError):
            rdd._memory_limit()

        with self.assertRaises(NotImplementedError):
            rdd.groupWith(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.sampleByKey(None, None, None)

        with self.assertRaises(NotImplementedError):
            rdd.subtract(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.coalesce(None, None)

        with self.assertRaises(NotImplementedError):
            rdd.toDebugString()

        with self.assertRaises(NotImplementedError):
            rdd.getStorageLevel()

        with self.assertRaises(NotImplementedError):
            rdd._to_java_object_rdd()
Example No. 19
 def test_subtractByKey(self):
     sc = SparkContext(master='', conf=SparkConf())
     rdd1 = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
     rdd2 = sc.parallelize([('A', None), ('C', None)])
     self.assertListEqual(rdd1.subtractByKey(rdd2).collect(), [('B', 2)])
Example No. 20
 def test_sortBy_descending(self):
     sc = SparkContext(master='', conf=SparkConf())
     rdd = (sc.parallelize([1, 2, 3, 4, 5]).sortBy(lambda x: x,
                                                   ascending=False))
     self.assertListEqual(rdd.collect(), [5, 4, 3, 2, 1])
Example No. 21
 def test_get_all(self):
     conf = SparkConf()
     pairs = [(self.RANDOM_KEY, self.RANDOM_VALUE),
              (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
     conf.setAll(pairs)
     self.assertEquals(sorted(conf.getAll()), sorted(pairs))
Example No. 22
 def test_to_debug_string(self):
     conf = SparkConf()
     self.assertEquals(conf.toDebugString(), SparkConf.DEBUG_STRING)
Example No. 23
 def test_values(self):
     sc = SparkContext(master='', conf=SparkConf())
     rdd = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
     self.assertListEqual(rdd.values().collect(), [1, 2, 3])
Example No. 24
# -*- coding: utf-8 -*-

import os
import random

from dummy_spark import SparkContext, SparkConf
from dummy_spark.sql import SQLContext

__author__ = 'willmcginnis'

# make a spark conf
sconf = SparkConf()

# set some property (won't do anything)
sconf.set('spark.executor.extraClassPath', 'foo')

# use the spark conf to make a spark context
sc = SparkContext(master='', conf=sconf)

# set the log level (also doesn't do anything)
sc.setLogLevel('INFO')

# maybe make a useless sqlcontext (nothing implemented here yet)
sqlctx = SQLContext(sc)

# add pyfile just appends to the sys path
sc.addPyFile(os.path.dirname(__file__))

# do some hadoop configuration into the ether
sc._jsc.hadoopConfiguration().set('foo', 'bar')
Example No. 25
 def test_subtractByKey(self):
     """values method returns the values as expected."""
     sc = SparkContext(master='', conf=SparkConf())
     rdd1 = sc.parallelize([('A', 1), ('B', 2), ('C', 3)])
     rdd2 = sc.parallelize([('A', None), ('C', None)])
     self.assertListEqual(rdd1.subtractByKey(rdd2).collect(), [('B', 2)])
Example No. 26
 def test_to_debug_string(self):
     conf = SparkConf()
     self.assertEquals(conf.toDebugString(), SparkConf.DEBUG_STRING)
Example No. 27
 def test_get_all(self):
     conf = SparkConf()
     pairs = [(self.RANDOM_KEY, self.RANDOM_VALUE),
              (self.RANDOM_KEY2, self.RANDOM_VALUE2)]
     conf.setAll(pairs)
     self.assertEquals(sorted(conf.getAll()), sorted(pairs))
Example No. 28
    def test_named_properties(self):
        conf = SparkConf()

        conf.setMaster(self.RANDOM_VALUE)
        self.assertEquals(conf.get('master'), self.RANDOM_VALUE)

        conf.setAppName(self.RANDOM_VALUE)
        self.assertEquals(conf.get('appName'), self.RANDOM_VALUE)

        conf.setSparkHome(self.RANDOM_VALUE)
        self.assertEquals(conf.get('sparkHome'), self.RANDOM_VALUE)