import os
import shutil
import tempfile
import time
import unittest

from pyspark import SparkConf, SparkContext
from pyspark_flame import FlameProfiler


def wait_a_bit(x):
    # Module-level helper assumed from the original pyspark_flame_test.py; the
    # body is an assumption: sleeping ~5s per task at a 0.25s sample interval
    # yields the ~80 samples asserted below (4 tasks * 5s / 0.25s = 80).
    time.sleep(5)
    return x


def crash(x):
    # Module-level helper assumed from the original test file; it raises so
    # test_propagate_exception can check that worker errors surface.
    raise RuntimeError('crash')


class PysparkFlameTest(unittest.TestCase):
    def setUp(self):
        self.dumpdir = tempfile.mkdtemp()
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[*]', 'test', conf=conf,
                               profiler_cls=FlameProfiler,
                               environment={'pyspark_flame.interval': 0.25})

    def tearDown(self):
        self.sc.stop()
        shutil.rmtree(self.dumpdir)

    def test_pyspark_flame(self):
        self.sc.parallelize(range(4)).map(wait_a_bit).sum()
        self.sc.dump_profiles(self.dumpdir)
        dumps = os.listdir(self.dumpdir)
        self.assertEqual(1, len(dumps))
        with open(os.path.join(self.dumpdir, dumps[0])) as dumpfile:
            # Each dump line is in flamegraph "folded" format: "stack count".
            for line in dumpfile.readlines():
                location, count = line.split(' ')
                if 'pyspark_flame_test.py:wait_a_bit:11' in location:
                    count = int(count)
                    self.assertIn(count, range(70, 90))
                    return
            else:
                # for/else: a match returns out of the function above, so
                # reaching here means no wait_a_bit sample was found.
                self.fail('No wait_a_bit profile line found')

    def test_propagate_exception(self):
        with self.assertRaises(Exception):
            self.sc.parallelize(range(4)).map(crash).sum()
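The dump files FlameProfiler writes are plain text in the flamegraph "folded" format that the test parses above. As a quick sanity check outside the test harness, a small reader can rank the hottest stacks; `top_stacks` is a hypothetical helper sketched here, not part of pyspark-flame.

import os

def top_stacks(dumpdir, n=5):
    # Collect (sample_count, stack) pairs from every dump file in the
    # directory, assuming the "stack count" folded format parsed above.
    samples = []
    for name in os.listdir(dumpdir):
        with open(os.path.join(dumpdir, name)) as f:
            for line in f:
                stack, _, count = line.rpartition(' ')
                samples.append((int(count), stack))
    # Print the n stacks with the most samples.
    for count, stack in sorted(samples, reverse=True)[:n]:
        print(count, stack)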
import os
import sys
import tempfile
from io import StringIO

from pyspark import SparkConf, SparkContext, BasicProfiler
from pyspark.testing.utils import PySparkTestCase


class ProfilerTests(PySparkTestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext('local[4]', class_name, conf=conf)

    def test_profiler(self):
        self.do_computation()
        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        id, profiler, _ = profilers[0]
        stats = profiler.stats()
        self.assertTrue(stats is not None)
        width, stat_list = stats.get_print_list([])
        func_names = [func_name for fname, n, func_name in stat_list]
        self.assertTrue("heavy_foo" in func_names)

        # show_profiles() writes to stdout, so capture it to assert on it.
        old_stdout = sys.stdout
        sys.stdout = io = StringIO()
        self.sc.show_profiles()
        self.assertTrue("heavy_foo" in io.getvalue())
        sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)
        self.assertTrue("rdd_%d.pstats" % id in os.listdir(d))

    def test_custom_profiler(self):
        class TestCustomProfiler(BasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.profiler_cls = TestCustomProfiler

        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(1, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        def heavy_foo(x):
            # Busy loop so the profiler has something to sample.
            for i in range(1 << 18):
                x = 1

        rdd = self.sc.parallelize(range(100))
        rdd.foreach(heavy_foo)
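The custom-profiler hook exercised by test_custom_profiler is also usable directly from application code. A minimal sketch, assuming only the BasicProfiler API shown above (stats() returning a pstats.Stats object); CumulativeProfiler and the demo app name are illustrative, not part of PySpark:

from pyspark import SparkConf, SparkContext, BasicProfiler

class CumulativeProfiler(BasicProfiler):
    def show(self, id):
        # Render the accumulated cProfile stats sorted by cumulative time.
        print("Profile of RDD<id=%d>" % id)
        self.stats().sort_stats("cumulative").print_stats(10)

sc = SparkContext(
    "local[2]", "custom-profiler-demo",
    conf=SparkConf().set("spark.python.profile", "true"),
    profiler_cls=CumulativeProfiler,
)
sc.parallelize(range(100)).map(lambda x: x * x).count()
sc.show_profiles()  # dispatches to CumulativeProfiler.show
sc.stop()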
    # Continues the test case above: both profiler entry points refuse to run
    # when profiling is disabled.
    def test_profiler_disabled(self):
        sc = SparkContext(conf=SparkConf().set("spark.python.profile", "false"))
        try:
            # assertRaisesRegexp is a deprecated alias; use assertRaisesRegex.
            self.assertRaisesRegex(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.show_profiles())
            self.assertRaisesRegex(
                RuntimeError,
                "'spark.python.profile' configuration must be set",
                lambda: sc.dump_profiles("/tmp/abc"))
        finally:
            sc.stop()
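Because both entry points raise RuntimeError when profiling is off, a script meant to run with or without profiling can guard the calls. A minimal self-contained sketch (the app name is illustrative):

from pyspark import SparkConf, SparkContext

sc = SparkContext("local[2]", "profile-guard-demo", conf=SparkConf())
sc.parallelize(range(10)).count()

# Only call the profiler entry points when profiling was actually enabled,
# so the same script runs cleanly with spark.python.profile unset.
if sc.getConf().get("spark.python.profile", "false") == "true":
    sc.show_profiles()
sc.stop()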
import os
import sys
import tempfile
import unittest
from io import StringIO

from pyspark import SparkConf, SparkContext
from pyspark.profiler import UDFBasicProfiler
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf


class UDFProfilerTests(unittest.TestCase):
    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        conf = SparkConf().set("spark.python.profile", "true")
        self.sc = SparkContext("local[4]", class_name, conf=conf)
        # Private builder hook used by Spark's own tests to reuse this context.
        self.spark = SparkSession.builder._sparkContext(self.sc).getOrCreate()

    def tearDown(self):
        self.spark.stop()
        sys.path = self._old_sys_path

    def test_udf_profiler(self):
        self.do_computation()
        profilers = self.sc.profiler_collector.profilers
        # One profiler per UDF invocation: add1, add2, and a second add1.
        self.assertEqual(3, len(profilers))

        old_stdout = sys.stdout
        try:
            sys.stdout = io = StringIO()
            self.sc.show_profiles()
        finally:
            sys.stdout = old_stdout

        d = tempfile.gettempdir()
        self.sc.dump_profiles(d)

        for i, udf_name in enumerate(["add1", "add2", "add1"]):
            id, profiler, _ = profilers[i]
            with self.subTest(id=id, udf_name=udf_name):
                stats = profiler.stats()
                self.assertTrue(stats is not None)
                width, stat_list = stats.get_print_list([])
                func_names = [func_name for fname, n, func_name in stat_list]
                self.assertTrue(udf_name in func_names)
                self.assertTrue(udf_name in io.getvalue())
                self.assertTrue("udf_%d.pstats" % id in os.listdir(d))

    def test_custom_udf_profiler(self):
        class TestCustomProfiler(UDFBasicProfiler):
            def show(self, id):
                self.result = "Custom formatting"

        self.sc.profiler_collector.udf_profiler_cls = TestCustomProfiler
        self.do_computation()

        profilers = self.sc.profiler_collector.profilers
        self.assertEqual(3, len(profilers))
        _, profiler, _ = profilers[0]
        self.assertTrue(isinstance(profiler, TestCustomProfiler))

        self.sc.show_profiles()
        self.assertEqual("Custom formatting", profiler.result)

    def do_computation(self):
        @udf
        def add1(x):
            return x + 1

        @udf
        def add2(x):
            return x + 2

        df = self.spark.range(10)
        df.select(add1("id"), add2("id"), add1("id")).collect()
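Outside a test harness, the same UDF profiling can be driven from a plain SparkSession. A minimal sketch, assuming Spark 3.1+ where Python UDFs get their own profilers (the app name is illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("udf-profiler-demo")
    .config("spark.python.profile", "true")
    .getOrCreate()
)

@udf("long")
def add1(x):
    return x + 1

spark.range(10).select(add1("id")).collect()
spark.sparkContext.show_profiles()  # prints one profile section per UDF
spark.stop()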
from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
# 'master' is not a valid configuration key; use setMaster() (or 'spark.master').
conf.setMaster('spark://hadoop-maste:7077')
conf.set('spark.python.profile', 'true')
context = SparkContext(conf=conf)

rdd = context.parallelize(np.arange(10), 3)
print(rdd.collect())

# show_profiles() prints to stdout and returns None, so don't wrap it in print().
context.show_profiles()
context.dump_profiles('/datas/profiles/')
context.stop()
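dump_profiles() writes standard cProfile data (named rdd_<id>.pstats, per the tests above), so the files can be read back with the stdlib pstats module. A minimal sketch; the exact filename here is an assumption:

import pstats

# The path and filename are assumptions matching the dump_profiles() call
# above and the "rdd_%d.pstats" naming seen in the tests.
stats = pstats.Stats('/datas/profiles/rdd_1.pstats')
stats.sort_stats('cumulative').print_stats(10)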
import time
import random

from pyspark_flame import FlameProfiler
from pyspark import SparkConf, SparkContext


def multiply_inefficiently(x):
    # Deliberately slow map function so the sampling profiler has work to see.
    for i in range(1000):
        time.sleep(0.0001 * random.random())
        time.sleep(0.0001 * random.random())
    return x * 2


conf = SparkConf().set("spark.python.profile", "true")  # .set("spark.python.profile.dump", ".")
sc = SparkContext('local', 'test', conf=conf, profiler_cls=FlameProfiler,
                  environment={'pyspark_flame.interval': 0.25})
sc.parallelize(range(1000)).map(multiply_inefficiently).take(10)
sc.show_profiles()
sc.dump_profiles('.')
sc.stop()
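To turn the dump into an actual flame graph, feed it to Brendan Gregg's flamegraph.pl. A hedged sketch: the exact dump filename FlameProfiler writes is an assumption, hence the glob, and flamegraph.pl must be on PATH.

import glob
import subprocess

for dump in glob.glob('rdd_*'):
    if dump.endswith('.svg'):
        continue  # skip output from a previous run
    with open(dump + '.svg', 'w') as out:
        subprocess.run(['flamegraph.pl', dump], stdout=out, check=True)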