def test_huge_dataset(self):
    # Feed ten copies of the fixture data to the merger as ready-made
    # combiners (each value wrapped in a one-element list of its string
    # form), require at least one spill, and check that the merged items
    # still hold self.N * 10 values in total.
    # NOTE(review): assumes self.data is a list of (key, value) pairs and
    # that self.N == len(self.data) -- confirm against the fixture setup.
    m = ExternalMerger(self.agg, 10)
    try:
        # Index into each pair instead of using a tuple-unpacking lambda
        # parameter: `lambda (k, v): ...` is a SyntaxError on Python 3
        # (PEP 3113), whereas indexing works on Python 2 and 3 alike.
        m.mergeCombiners(
            map(lambda kv: (kv[0], [str(kv[1])]), self.data * 10))
        self.assertTrue(m.spills >= 1)
        self.assertEqual(
            sum(len(v) for k, v in m._recursive_merged_items(0)),
            self.N * 10)
    finally:
        # Run the merger's cleanup even when a merge step or an assertion
        # above raises; previously it was skipped on failure.
        m._cleanup()
def test_simple_aggregator_with_medium_dataset(self):
    # SPARK-39179: Test Simple aggregator
    # Values are combined with plain addition, so each merged item carries a
    # single number (not a list); the grand total must equal
    # sum(range(self.N)) and at least one spill must have happened.
    adder = SimpleAggregator(lambda a, b: a + b)
    merger = ExternalMerger(adder, 20)
    merger.mergeValues(self.data)

    spilled_at_least_once = merger.spills >= 1
    self.assertTrue(spilled_at_least_once)

    merged_total = sum(value for _, value in merger.items())
    self.assertEqual(merged_total, sum(range(self.N)))
def test_shuffle_data_with_multiple_locations(self):
    # SPARK-39179: Test shuffling data when multiple local directories are
    # configured, and also check that the shuffle locations get randomized.
    with tempfile.TemporaryDirectory() as tempdir1:
        with tempfile.TemporaryDirectory() as tempdir2:
            # Remember the caller's setting so it can be restored afterwards.
            saved_local_dirs = os.environ.get("SPARK_LOCAL_DIRS", None)
            os.environ["SPARK_LOCAL_DIRS"] = tempdir1 + "," + tempdir2
            try:
                # Whether tempdir1 has been seen as the first / second entry
                # of a merger's localdirs in any of the ten rounds.
                tempdir1_seen_first = False
                tempdir1_seen_second = False
                for _ in range(10):
                    merger = ExternalMerger(self.agg, 20)
                    if merger.localdirs[0].startswith(tempdir1):
                        tempdir1_seen_first = True
                    elif merger.localdirs[1].startswith(tempdir1):
                        tempdir1_seen_second = True

                    merger.mergeValues(self.data)
                    self.assertTrue(merger.spills >= 1)
                    merged_total = sum(sum(vals) for _, vals in merger.items())
                    self.assertEqual(merged_total, sum(range(self.N)))

                # Both orderings must have been observed at least once.
                self.assertTrue(tempdir1_seen_first and tempdir1_seen_second)
            finally:
                # Put the environment back exactly as it was found.
                if original_missing := saved_local_dirs is None:
                    del os.environ["SPARK_LOCAL_DIRS"]
                else:
                    os.environ["SPARK_LOCAL_DIRS"] = saved_local_dirs
def test_small_dataset(self):
    # With a limit argument of 1000 the fixture data is expected to be merged
    # without spilling, through both mergeValues and mergeCombiners, and the
    # merged values must add up to sum(range(self.N)) each time.
    by_values = ExternalMerger(self.agg, 1000)
    by_values.mergeValues(self.data)
    self.assertEqual(by_values.spills, 0)
    values_total = sum(sum(vals) for _, vals in by_values.items())
    self.assertEqual(values_total, sum(range(self.N)))

    def as_combiner(pair):
        # Wrap the value in a list so the pair looks like a combiner.
        return (pair[0], [pair[1]])

    by_combiners = ExternalMerger(self.agg, 1000)
    by_combiners.mergeCombiners(map(as_combiner, self.data))
    self.assertEqual(by_combiners.spills, 0)
    combiners_total = sum(sum(vals) for _, vals in by_combiners.items())
    self.assertEqual(combiners_total, sum(range(self.N)))
def test_stopiteration_is_raised(self):
    # For each of the three aggregator callbacks in turn, substitute one that
    # raises StopIteration and expect the merge call to fail with
    # Py4JJavaError or RuntimeError.
    def stopit(*args, **kwargs):
        raise StopIteration()

    def legit_create_combiner(x):
        return [x]

    def legit_merge_value(x, y):
        x.append(y)
        return x

    def legit_merge_combiners(x, y):
        x.extend(y)
        return x

    def new_merger(create_combiner, merge_value, merge_combiners):
        # One place to build the merger so each case only names its callbacks.
        return ExternalMerger(
            Aggregator(create_combiner, merge_value, merge_combiners), 20)

    data = [(x % 2, x) for x in range(100)]
    expected_errors = (Py4JJavaError, RuntimeError)

    # wrong create combiner
    merger = new_merger(stopit, legit_merge_value, legit_merge_combiners)
    with self.assertRaises(expected_errors):
        merger.mergeValues(data)

    # wrong merge value
    merger = new_merger(legit_create_combiner, stopit, legit_merge_combiners)
    with self.assertRaises(expected_errors):
        merger.mergeValues(data)

    # wrong merge combiners
    merger = new_merger(legit_create_combiner, legit_merge_value, stopit)
    with self.assertRaises(expected_errors):
        merger.mergeCombiners(map(lambda pair: (pair[0], [pair[1]]), data))
def test_medium_dataset(self):
    # Both merge entry points must spill at least once with these small
    # limit arguments (20 and 10) and still produce the full totals:
    # sum(range(self.N)) for one copy of the data, three times that for
    # three copies.
    single_copy_total = sum(range(self.N))

    value_merger = ExternalMerger(self.agg, 20)
    value_merger.mergeValues(self.data)
    self.assertTrue(value_merger.spills >= 1)
    merged_total = sum(sum(vals) for _, vals in value_merger.items())
    self.assertEqual(merged_total, single_copy_total)

    def to_combiner(pair):
        # Wrap the value in a list so the pair looks like a combiner.
        return (pair[0], [pair[1]])

    combiner_merger = ExternalMerger(self.agg, 10)
    combiner_merger.mergeCombiners(map(to_combiner, self.data * 3))
    self.assertTrue(combiner_merger.spills >= 1)
    merged_total = sum(sum(vals) for _, vals in combiner_merger.items())
    self.assertEqual(merged_total, single_copy_total * 3)
def test_huge_dataset(self):
    # Feed ten copies of the fixture data to a merger built with limit
    # argument 5 and partitions=3, as ready-made combiners whose values are
    # one-element lists of strings. At least one spill is required, and the
    # merged items must hold self.N * 10 values in total (values are strings,
    # so they are counted rather than summed).
    # NOTE(review): assumes self.N == len(self.data) -- confirm against the
    # fixture setup.
    m = ExternalMerger(self.agg, 5, partitions=3)
    try:
        m.mergeCombiners(
            map(lambda k_v: (k_v[0], [str(k_v[1])]), self.data * 10))
        self.assertTrue(m.spills >= 1)
        self.assertEqual(sum(len(v) for k, v in m.items()), self.N * 10)
    finally:
        # Run the merger's cleanup even when a merge step or an assertion
        # above raises; previously it was skipped on failure.
        m._cleanup()
def test_medium_dataset(self):
    # Both merge entry points must spill at least once with limit argument
    # 10 and still produce the full totals: sum(range(self.N)) for one copy
    # of the fixture data, three times that for three copies.
    # NOTE(review): iteritems() is the merger's own method and is left
    # untouched; confirm the ExternalMerger in this tree still exposes it.
    m = ExternalMerger(self.agg, 10)
    m.mergeValues(self.data)
    self.assertTrue(m.spills >= 1)
    # range() replaces the Python-2-only xrange(); the sum is identical.
    self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                     sum(range(self.N)))

    m = ExternalMerger(self.agg, 10)
    # Index into each pair instead of using a tuple-unpacking lambda
    # parameter: `lambda (x, y): ...` is a SyntaxError on Python 3
    # (PEP 3113), whereas indexing works on Python 2 and 3 alike.
    m.mergeCombiners(map(lambda kv: (kv[0], [kv[1]]), self.data * 3))
    self.assertTrue(m.spills >= 1)
    self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                     sum(range(self.N)) * 3)
def test_small_dataset(self):
    # With a limit argument of 1000 the fixture data is expected to be merged
    # without spilling, through both mergeValues and mergeCombiners, and the
    # merged values must add up to sum(range(self.N)) each time.
    # NOTE(review): iteritems() is the merger's own method and is left
    # untouched; confirm the ExternalMerger in this tree still exposes it.
    m = ExternalMerger(self.agg, 1000)
    m.mergeValues(self.data)
    self.assertEqual(m.spills, 0)
    # range() replaces the Python-2-only xrange(); the sum is identical.
    self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                     sum(range(self.N)))

    m = ExternalMerger(self.agg, 1000)
    # Index into each pair instead of using a tuple-unpacking lambda
    # parameter: `lambda (x, y): ...` is a SyntaxError on Python 3
    # (PEP 3113), whereas indexing works on Python 2 and 3 alike.
    m.mergeCombiners(map(lambda kv: (kv[0], [kv[1]]), self.data))
    self.assertEqual(m.spills, 0)
    self.assertEqual(sum(sum(v) for k, v in m.iteritems()),
                     sum(range(self.N)))
def test_huge_dataset(self):
    # Merge ten copies of the fixture data as ready-made combiners (values
    # turned into one-element lists of strings) through a merger built with
    # limit argument 5 and partitions=3. The merger must spill at least once
    # and the merged items must contain self.N * 10 values overall; values
    # are strings, so they are counted with len() rather than summed.
    # NOTE(review): assumes self.N == len(self.data) -- confirm against the
    # fixture setup.
    m = ExternalMerger(self.agg, 5, partitions=3)
    try:
        combiners = map(lambda k_v: (k_v[0], [str(k_v[1])]), self.data * 10)
        m.mergeCombiners(combiners)
        self.assertTrue(m.spills >= 1)
        self.assertEqual(sum(len(v) for k, v in m.items()), self.N * 10)
    finally:
        # Previously the cleanup call was skipped whenever a merge step or an
        # assertion raised; the finally clause makes it unconditional.
        m._cleanup()
def test_stopiteration_is_raised(self):
    # Each case swaps exactly one aggregator callback for one that raises
    # StopIteration; the merge call is then expected to fail with
    # Py4JJavaError or RuntimeError.
    def stopit(*args, **kwargs):
        raise StopIteration()

    def legit_create_combiner(x):
        return [x]

    def legit_merge_value(x, y):
        x.append(y)
        return x

    def legit_merge_combiners(x, y):
        x.extend(y)
        return x

    data = [(x % 2, x) for x in range(100)]
    failure_types = (Py4JJavaError, RuntimeError)

    # wrong create combiner
    bad_create = Aggregator(stopit, legit_merge_value, legit_merge_combiners)
    merger = ExternalMerger(bad_create, 20)
    with self.assertRaises(failure_types):
        merger.mergeValues(data)

    # wrong merge value
    bad_merge_value = Aggregator(
        legit_create_combiner, stopit, legit_merge_combiners)
    merger = ExternalMerger(bad_merge_value, 20)
    with self.assertRaises(failure_types):
        merger.mergeValues(data)

    # wrong merge combiners
    bad_merge_combiners = Aggregator(
        legit_create_combiner, legit_merge_value, stopit)
    merger = ExternalMerger(bad_merge_combiners, 20)
    with self.assertRaises(failure_types):
        merger.mergeCombiners((key, [value]) for key, value in data)