예제 #1
0
 def test_huge_dataset(self):
     """Merge ten copies of the data as combiners and count the values.

     Every value is converted to ``str`` and wrapped in a one-element list
     before being passed to ``mergeCombiners``. The test expects at least
     one spill, and expects the value lists yielded by
     ``_recursive_merged_items(0)`` to contain ``self.N * 10`` entries in
     total.
     """
     # NOTE(review): the second argument (10) is presumably a memory limit;
     # confirm against ExternalMerger.
     m = ExternalMerger(self.agg, 10)
     try:
         # ``lambda (k, v): ...`` (tuple-parameter unpacking) is a syntax
         # error on Python 3; indexing the pair works on Python 2 and 3.
         m.mergeCombiners(
             map(lambda kv: (kv[0], [str(kv[1])]), self.data * 10))
         self.assertTrue(m.spills >= 1)
         self.assertEqual(
             sum(len(v) for k, v in m._recursive_merged_items(0)),
             self.N * 10)
     finally:
         # Run the merger's cleanup even when the merge or an assertion
         # above fails, so a failing test does not skip it.
         m._cleanup()
예제 #2
0
 def test_simple_aggregator_with_medium_dataset(self):
     """Merge values through a SimpleAggregator and verify the total.

     SPARK-39179: the aggregator combines values with ``+``. The merger
     must report at least one spill, and the merged values must add up to
     ``sum(range(self.N))``.
     """
     adder = SimpleAggregator(lambda left, right: left + right)
     merger = ExternalMerger(adder, 20)
     merger.mergeValues(self.data)
     self.assertTrue(merger.spills >= 1)

     merged_total = sum(value for _key, value in merger.items())
     expected_total = sum(range(self.N))
     self.assertEqual(merged_total, expected_total)
예제 #3
0
    def test_shuffle_data_with_multiple_locations(self):
        """Shuffle with two local directories and check their order varies.

        SPARK-39179: SPARK_LOCAL_DIRS is pointed at two temporary
        directories and ten mergers are built. For each merger the test
        records which of the first two ``localdirs`` entries starts with
        the first directory, and checks that the merger spills and yields
        the expected sum. At the end the first directory must have been
        seen in both positions. The environment variable is restored (or
        removed) afterwards.
        """
        env_key = "SPARK_LOCAL_DIRS"
        with tempfile.TemporaryDirectory() as first_dir:
            with tempfile.TemporaryDirectory() as second_dir:
                saved = os.environ.get(env_key, None)
                os.environ[env_key] = first_dir + "," + second_dir
                try:
                    seen_in_slot = [False, False]
                    for _ in range(10):
                        merger = ExternalMerger(self.agg, 20)
                        # Record only the first slot that matches; the
                        # second slot is inspected only if the first one
                        # does not match.
                        for slot in (0, 1):
                            if merger.localdirs[slot].startswith(first_dir):
                                seen_in_slot[slot] = True
                                break
                        merger.mergeValues(self.data)
                        self.assertTrue(merger.spills >= 1)
                        merged_total = sum(
                            sum(values) for _key, values in merger.items())
                        self.assertEqual(merged_total, sum(range(self.N)))
                    # Both flags are booleans, so this is true only when the
                    # first directory appeared in slot 0 and in slot 1.
                    self.assertTrue(seen_in_slot[0] and seen_in_slot[1])
                finally:
                    if saved is None:
                        del os.environ[env_key]
                    else:
                        os.environ[env_key] = saved
예제 #4
0
    def test_small_dataset(self):
        """Merge the data via both entry points and expect no spills.

        The data is merged once with ``mergeValues`` and once with
        ``mergeCombiners`` (each value wrapped in a one-element list). In
        both cases ``spills`` must be 0 and the merged values must add up
        to ``sum(range(self.N))``.
        """
        def as_combiner(pair):
            # Keep the key and wrap the value in a one-element list.
            return (pair[0], [pair[1]])

        by_values = ExternalMerger(self.agg, 1000)
        by_values.mergeValues(self.data)
        self.assertEqual(by_values.spills, 0)
        merged_total = sum(sum(values) for _key, values in by_values.items())
        self.assertEqual(merged_total, sum(range(self.N)))

        by_combiners = ExternalMerger(self.agg, 1000)
        by_combiners.mergeCombiners(map(as_combiner, self.data))
        self.assertEqual(by_combiners.spills, 0)
        merged_total = sum(
            sum(values) for _key, values in by_combiners.items())
        self.assertEqual(merged_total, sum(range(self.N)))
예제 #5
0
    def test_stopiteration_is_raised(self):
        """A StopIteration raised by an aggregator callback must be reported.

        Each of the three Aggregator callbacks is replaced in turn by a
        function that raises StopIteration. The corresponding merge call is
        then expected to raise Py4JJavaError or RuntimeError.
        """
        def raise_stop(*args, **kwargs):
            raise StopIteration()

        def new_combiner(value):
            return [value]

        def add_value(combiner, value):
            combiner.append(value)
            return combiner

        def add_combiner(combiner, other):
            combiner.extend(other)
            return combiner

        pairs = [(n % 2, n) for n in range(100)]

        def run_merge_values(merger):
            merger.mergeValues(pairs)

        def run_merge_combiners(merger):
            merger.mergeCombiners(
                map(lambda kv: (kv[0], [kv[1]]), pairs))

        scenarios = (
            # wrong create combiner
            ((raise_stop, add_value, add_combiner), run_merge_values),
            # wrong merge value
            ((new_combiner, raise_stop, add_combiner), run_merge_values),
            # wrong merge combiners
            ((new_combiner, add_value, raise_stop), run_merge_combiners),
        )
        for callbacks, run in scenarios:
            merger = ExternalMerger(Aggregator(*callbacks), 20)
            with self.assertRaises((Py4JJavaError, RuntimeError)):
                run(merger)
예제 #6
0
    def test_medium_dataset(self):
        """Merge enough data to spill and verify the merged totals.

        First the data is merged with ``mergeValues``; then three copies of
        it are merged with ``mergeCombiners`` (each value wrapped in a
        one-element list). Both runs must report at least one spill and
        produce the expected sum.
        """
        base_total = sum(range(self.N))

        merger = ExternalMerger(self.agg, 20)
        merger.mergeValues(self.data)
        self.assertTrue(merger.spills >= 1)
        merged_total = sum(sum(values) for _key, values in merger.items())
        self.assertEqual(merged_total, base_total)

        merger = ExternalMerger(self.agg, 10)
        tripled = self.data * 3
        merger.mergeCombiners(
            map(lambda pair: (pair[0], [pair[1]]), tripled))
        self.assertTrue(merger.spills >= 1)
        merged_total = sum(sum(values) for _key, values in merger.items())
        self.assertEqual(merged_total, base_total * 3)
예제 #7
0
 def test_huge_dataset(self):
     """Merge ten copies of the data with ``partitions=3`` and count values.

     Every value is converted to ``str`` and wrapped in a one-element list
     before being passed to ``mergeCombiners``. The test expects at least
     one spill and expects the merged value lists to contain
     ``self.N * 10`` entries in total.
     """
     m = ExternalMerger(self.agg, 5, partitions=3)
     try:
         m.mergeCombiners(
             map(lambda k_v: (k_v[0], [str(k_v[1])]), self.data * 10))
         self.assertTrue(m.spills >= 1)
         self.assertEqual(sum(len(v) for k, v in m.items()), self.N * 10)
     finally:
         # Run the merger's cleanup even when the merge or an assertion
         # above fails, so a failing test does not skip it.
         m._cleanup()
예제 #8
0
파일: tests.py 프로젝트: fireflyc/spark
    def test_medium_dataset(self):
        """Merge enough data to spill and verify the merged totals.

        First the data is merged with ``mergeValues``; then three copies of
        it are merged with ``mergeCombiners`` (each value wrapped in a
        one-element list). Both runs must report at least one spill and
        produce the expected sum.
        """
        m = ExternalMerger(self.agg, 10)
        m.mergeValues(self.data)
        self.assertTrue(m.spills >= 1)
        # ``iteritems`` here is a method of the merger object, not of a
        # dict, so it is left unchanged. ``range`` replaces ``xrange`` so
        # the expression also evaluates on Python 3.
        self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                         sum(range(self.N)))

        m = ExternalMerger(self.agg, 10)
        # ``lambda (x, y): ...`` (tuple-parameter unpacking) is a syntax
        # error on Python 3; indexing the pair works on Python 2 and 3.
        m.mergeCombiners(map(lambda xy: (xy[0], [xy[1]]), self.data * 3))
        self.assertTrue(m.spills >= 1)
        self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                         sum(range(self.N)) * 3)
예제 #9
0
파일: tests.py 프로젝트: fireflyc/spark
    def test_small_dataset(self):
        """Merge the data via both entry points and expect no spills.

        The data is merged once with ``mergeValues`` and once with
        ``mergeCombiners`` (each value wrapped in a one-element list). In
        both cases ``spills`` must be 0 and the merged values must add up
        to ``sum(range(self.N))``.
        """
        m = ExternalMerger(self.agg, 1000)
        m.mergeValues(self.data)
        self.assertEqual(m.spills, 0)
        # ``iteritems`` here is a method of the merger object, not of a
        # dict, so it is left unchanged. ``range`` replaces ``xrange`` so
        # the expression also evaluates on Python 3.
        self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                         sum(range(self.N)))

        m = ExternalMerger(self.agg, 1000)
        # ``lambda (x, y): ...`` (tuple-parameter unpacking) is a syntax
        # error on Python 3; indexing the pair works on Python 2 and 3.
        m.mergeCombiners(map(lambda xy: (xy[0], [xy[1]]), self.data))
        self.assertEqual(m.spills, 0)
        self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                         sum(range(self.N)))
예제 #10
0
파일: tests.py 프로젝트: fireflyc/spark
 def test_huge_dataset(self):
     """Merge ten copies of the data as combiners and count the values.

     Every value is converted to ``str`` and wrapped in a one-element list
     before being passed to ``mergeCombiners``. The test expects at least
     one spill, and expects the value lists yielded by
     ``_recursive_merged_items(0)`` to contain ``self.N * 10`` entries in
     total.
     """
     m = ExternalMerger(self.agg, 10)
     try:
         # ``lambda (k, v): ...`` (tuple-parameter unpacking) is a syntax
         # error on Python 3; indexing the pair works on Python 2 and 3.
         m.mergeCombiners(
             map(lambda kv: (kv[0], [str(kv[1])]), self.data * 10))
         self.assertTrue(m.spills >= 1)
         self.assertEqual(
             sum(len(v) for k, v in m._recursive_merged_items(0)),
             self.N * 10)
     finally:
         # Run the merger's cleanup even when the merge or an assertion
         # above fails, so a failing test does not skip it.
         m._cleanup()
예제 #11
0
 def test_huge_dataset(self):
     """Merge ten copies of the data with ``partitions=3`` and count values.

     Every value is converted to ``str`` and wrapped in a one-element list
     before being passed to ``mergeCombiners``. The test expects at least
     one spill and expects the merged value lists to contain
     ``self.N * 10`` entries in total.
     """
     m = ExternalMerger(self.agg, 5, partitions=3)
     try:
         m.mergeCombiners(
             map(lambda k_v: (k_v[0], [str(k_v[1])]), self.data * 10))
         self.assertTrue(m.spills >= 1)
         self.assertEqual(sum(len(v) for k, v in m.items()),
                          self.N * 10)
     finally:
         # Run the merger's cleanup even when the merge or an assertion
         # above fails, so a failing test does not skip it.
         m._cleanup()
예제 #12
0
    def test_stopiteration_is_raised(self):
        """A StopIteration raised by an aggregator callback must be reported.

        Each of the three Aggregator callbacks is replaced in turn by a
        function that raises StopIteration. The corresponding merge call is
        then expected to raise Py4JJavaError or RuntimeError.
        """
        def failing(*args, **kwargs):
            raise StopIteration()

        def create(item):
            return [item]

        def merge_value(acc, item):
            acc.append(item)
            return acc

        def merge_combiners(acc, other):
            acc.extend(other)
            return acc

        records = [(i % 2, i) for i in range(100)]
        expected_errors = (Py4JJavaError, RuntimeError)

        def check_fails(create_fn, value_fn, combiners_fn, via_combiners):
            # Build a fresh merger for the scenario, then expect the chosen
            # merge entry point to raise one of the accepted errors.
            merger = ExternalMerger(
                Aggregator(create_fn, value_fn, combiners_fn), 20)
            with self.assertRaises(expected_errors):
                if via_combiners:
                    merger.mergeCombiners(
                        map(lambda rec: (rec[0], [rec[1]]), records))
                else:
                    merger.mergeValues(records)

        # wrong create combiner
        check_fails(failing, merge_value, merge_combiners, False)
        # wrong merge value
        check_fails(create, failing, merge_combiners, False)
        # wrong merge combiners
        check_fails(create, merge_value, failing, True)