def test_small_dataset(self):
    """Merge ``self.data`` with a generous limit and expect zero spills.

    Both entry points are exercised with ``1000`` as the second constructor
    argument (presumably the memory limit -- confirm against
    ``ExternalMerger``): ``mergeValues`` on the raw pairs and
    ``mergeCombiners`` on pairs whose value is wrapped in a one-element
    list.  In each case the merged values must add up to
    ``sum(range(self.N))``.
    """

    def as_combiner(pair):
        # Wrap the value so it looks like an already-combined list.
        return (pair[0], [pair[1]])

    # Path 1: merge raw values.
    merger = ExternalMerger(self.agg, 1000)
    merger.mergeValues(self.data)
    self.assertEqual(merger.spills, 0)
    merged_total = sum(sum(values) for _, values in merger.items())
    self.assertEqual(merged_total, sum(range(self.N)))

    # Path 2: merge pre-built combiners.
    merger = ExternalMerger(self.agg, 1000)
    merger.mergeCombiners(map(as_combiner, self.data))
    self.assertEqual(merger.spills, 0)
    merged_total = sum(sum(values) for _, values in merger.items())
    self.assertEqual(merged_total, sum(range(self.N)))
def test_medium_dataset(self):
    """Merge with small limits and expect at least one spill.

    ``mergeValues`` is run on ``self.data`` with ``20`` as the second
    constructor argument; ``mergeCombiners`` is run on three copies of
    ``self.data`` with ``10``.  The merged values must still sum to
    ``sum(range(self.N))`` (times three for the tripled input).
    """

    def as_combiner(pair):
        # Wrap the value so it looks like an already-combined list.
        return (pair[0], [pair[1]])

    # Path 1: merge raw values.
    merger = ExternalMerger(self.agg, 20)
    merger.mergeValues(self.data)
    self.assertTrue(merger.spills >= 1)
    merged_total = sum(sum(values) for _, values in merger.items())
    self.assertEqual(merged_total, sum(range(self.N)))

    # Path 2: merge pre-built combiners over the data repeated three times.
    merger = ExternalMerger(self.agg, 10)
    merger.mergeCombiners(map(as_combiner, self.data * 3))
    self.assertTrue(merger.spills >= 1)
    merged_total = sum(sum(values) for _, values in merger.items())
    self.assertEqual(merged_total, sum(range(self.N)) * 3)
def test_medium_dataset(self):
    """Merge with small limits and expect at least one spill.

    ``mergeValues`` is run on ``self.data`` with ``20`` as the second
    constructor argument; ``mergeCombiners`` is run on three copies of
    ``self.data`` with ``10``.  The merged values must still sum to
    ``sum(range(self.N))`` (times three for the tripled input).
    """

    def to_combiner(pair):
        # Wrap the value so it looks like an already-combined list.
        return (pair[0], [pair[1]])

    # ``range`` instead of ``xrange``: ``xrange`` does not exist on
    # Python 3, while ``sum(range(n))`` gives the same result on both.
    expected = sum(range(self.N))

    m = ExternalMerger(self.agg, 20)
    m.mergeValues(self.data)
    # assertGreaterEqual reports the actual spill count on failure.
    self.assertGreaterEqual(m.spills, 1)
    self.assertEqual(sum(sum(v) for _, v in m.items()), expected)

    m = ExternalMerger(self.agg, 10)
    m.mergeCombiners(map(to_combiner, self.data * 3))
    self.assertGreaterEqual(m.spills, 1)
    self.assertEqual(sum(sum(v) for _, v in m.items()), expected * 3)
def test_small_dataset(self):
    """Merge ``self.data`` with a generous limit and expect zero spills.

    Both ``mergeValues`` (raw pairs) and ``mergeCombiners`` (values wrapped
    in one-element lists) are exercised with ``1000`` as the second
    constructor argument.  In each case the merged values must add up to
    ``sum(range(self.N))``.
    """

    def to_combiner(pair):
        # Wrap the value so it looks like an already-combined list.
        return (pair[0], [pair[1]])

    # ``range`` instead of ``xrange``: ``xrange`` does not exist on
    # Python 3, while ``sum(range(n))`` gives the same result on both.
    expected = sum(range(self.N))

    m = ExternalMerger(self.agg, 1000)
    m.mergeValues(self.data)
    self.assertEqual(m.spills, 0)
    self.assertEqual(sum(sum(v) for _, v in m.items()), expected)

    m = ExternalMerger(self.agg, 1000)
    m.mergeCombiners(map(to_combiner, self.data))
    self.assertEqual(m.spills, 0)
    self.assertEqual(sum(sum(v) for _, v in m.items()), expected)
def test_huge_dataset(self):
    """Merge ten copies of the data as string combiners and expect spills.

    The merger is built with ``5`` as its second argument and
    ``partitions=3``.  Each value is converted to ``str`` and wrapped in a
    one-element list; after merging, the total number of collected values
    must equal ``self.N * 10``.
    """

    def to_str_combiner(pair):
        # Stringify the value and wrap it as a one-element combiner list.
        return (pair[0], [str(pair[1])])

    m = ExternalMerger(self.agg, 5, partitions=3)
    try:
        m.mergeCombiners(map(to_str_combiner, self.data * 10))
        self.assertGreaterEqual(m.spills, 1)
        self.assertEqual(sum(len(v) for _, v in m.items()), self.N * 10)
    finally:
        # Always run the merger's cleanup, even when the merge or an
        # assertion above fails; previously a failure skipped it.
        m._cleanup()
def test_huge_dataset(self):
    """Merge ``self.data * 10`` as stringified combiners; spills are expected.

    Uses ``ExternalMerger(self.agg, 5, partitions=3)``.  Every input pair
    becomes ``(key, [str(value)])``; the test passes when at least one
    spill occurred and the merged lists hold ``self.N * 10`` items in
    total.
    """

    def stringify(pair):
        # Stringify the value and wrap it as a one-element combiner list.
        return (pair[0], [str(pair[1])])

    merger = ExternalMerger(self.agg, 5, partitions=3)
    try:
        merger.mergeCombiners(map(stringify, self.data * 10))
        self.assertGreaterEqual(merger.spills, 1)
        item_count = sum(len(values) for _, values in merger.items())
        self.assertEqual(item_count, self.N * 10)
    finally:
        # Cleanup used to be the last statement, so any earlier failure
        # skipped it.  The finally block guarantees it runs.
        merger._cleanup()
def test_simple_aggregator_with_medium_dataset(self):
    """SPARK-39179: Test Simple aggregator.

    Builds a ``SimpleAggregator`` around an addition function, merges
    ``self.data`` with ``20`` as the merger's second argument, and checks
    that at least one spill happened and that the merged values (scalars
    here, not lists) sum to ``sum(range(self.N))``.
    """
    adder = SimpleAggregator(lambda left, right: left + right)
    merger = ExternalMerger(adder, 20)
    merger.mergeValues(self.data)

    self.assertTrue(merger.spills >= 1)
    merged_total = sum(value for _, value in merger.items())
    self.assertEqual(merged_total, sum(range(self.N)))
def test_shuffle_data_with_multiple_locations(self):
    """SPARK-39179: Test shuffle of data with multiple locations.

    Points ``SPARK_LOCAL_DIRS`` at two temporary directories, builds ten
    mergers, and for each one records whether the first temporary
    directory shows up at index 0 or index 1 of ``m.localdirs``.  Every
    merger must spill and produce the correct total; after the loop both
    positions must have been observed, i.e. the shuffle locations get
    randomized.  The previous value of ``SPARK_LOCAL_DIRS`` is restored
    (or the variable removed) afterwards.
    """
    env_key = "SPARK_LOCAL_DIRS"
    with tempfile.TemporaryDirectory() as dir_a, tempfile.TemporaryDirectory() as dir_b:
        previous = os.environ.get(env_key, None)
        os.environ[env_key] = ",".join([dir_a, dir_b])
        try:
            # seen_at[i] becomes True once dir_a was observed at localdirs[i].
            seen_at = [False, False]
            for _ in range(10):
                merger = ExternalMerger(self.agg, 20)
                local_dirs = merger.localdirs
                if local_dirs[0].startswith(dir_a):
                    seen_at[0] = True
                elif local_dirs[1].startswith(dir_a):
                    seen_at[1] = True

                merger.mergeValues(self.data)
                self.assertTrue(merger.spills >= 1)
                merged_total = sum(sum(values) for _, values in merger.items())
                self.assertEqual(merged_total, sum(range(self.N)))

            # Both orderings must have occurred across the ten mergers.
            self.assertTrue(all(seen_at))
        finally:
            # Put the environment back exactly as it was found.
            if previous is None:
                del os.environ[env_key]
            else:
                os.environ[env_key] = previous