from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from imutils import face_utils
from scipy.spatial import distance
import numpy as np
import imutils
import dlib
import cv2
import time

conf = SparkConf().setAppName("drowsy streaming").setMaster("yarn")
conf.set("spark.scheduler.mode", "FAIR")
conf.set("spark.scheduler.allocation.file",
         "/opt/spark-2.4.3-bin-hadoop2.7/conf/fairscheduler.xml")
sc = SparkContext(conf=conf)
sc.setLocalProperty("spark.scheduler.pool", "pool1")
ssc = StreamingContext(sc, 0.5)  # 0.5-second micro-batch interval
sql_sc = SQLContext(sc)

input_topic = 'input'
output_topic = 'output1'
brokers = "G01-01:2181,G01-02:2181,G01-03:2181,G01-04:2181,G01-05:2181,G01-06:2181,G01-07:2181,G01-08:2181," \
          "G01-09:2181,G01-10:2181,G01-11:2181,G01-12:2181,G01-13:2181,G01-14:2181,G01-15:2181,G01-16:2181"
predictor_path = "/home/hduser/DrunkDetection/shape_predictor_68_face_landmarks.dat"


def my_decoder(s):
    # Pass Kafka message payloads through unchanged.
    return s


def eye_aspect_ratio(eye):
    # Vertical distances between the two pairs of vertical eye landmarks.
    A = distance.euclidean(eye[1], eye[5])
    B = distance.euclidean(eye[2], eye[4])
    # Horizontal distance between the eye corners.
    C = distance.euclidean(eye[0], eye[3])
    # Standard eye-aspect-ratio formula; small values indicate a closed eye.
    return (A + B) / (2.0 * C)
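# A minimal, hypothetical sketch of how the declared topic and quorum above
# could be wired into the streaming job. The original pipeline body is not
# shown; the consumer group name and the placeholder output action are
# assumptions, and only the receiver-based KafkaUtils.createStream API that
# ships with Spark 2.4 is relied on here.
from pyspark.streaming.kafka import KafkaUtils

frames = KafkaUtils.createStream(ssc, brokers, "drowsy-group", {input_topic: 1},
                                 valueDecoder=my_decoder)
frames.count().pprint()  # placeholder output so the stream has an action

ssc.start()
ssc.awaitTermination()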
import random
import sys
import time

from pyspark import SparkContext, TaskContext, BarrierTaskContext
from pyspark.testing.utils import PySparkTestCase


class TaskContextTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # Allow retries even though they are normally disabled in local mode
        self.sc = SparkContext('local[4, 2]', class_name)

    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)

    def test_resources(self):
        """Test the resources are empty by default."""
        rdd = self.sc.parallelize(range(10))
        resources1 = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
        # Test using the constructor directly rather than the get()
        resources2 = rdd.map(lambda x: TaskContext().resources()).take(1)[0]
        self.assertEqual(len(resources1), 0)
        self.assertEqual(len(resources2), 0)

    def test_partition_id(self):
        """Test the partition id."""
        rdd1 = self.sc.parallelize(range(10), 1)
        rdd2 = self.sc.parallelize(range(10), 2)
        pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
        pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
        self.assertEqual(0, pids1[0])
        self.assertEqual(0, pids1[9])
        self.assertEqual(0, pids2[0])
        self.assertEqual(1, pids2[9])

    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))

        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]

        result = rdd.map(fail_on_first).collect()
        # The first partition should have been re-submitted (attempt 1); all
        # other partitions should still be attempt 0.
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        first_partition = [x for x in result if x[1] == 0]
        for x in first_partition:
            self.assertEqual(1, x[2])
        other_partitions = [x for x in result if x[1] != 0]
        for x in other_partitions:
            self.assertEqual(0, x[2])
        # The task attempt ids should be different
        self.assertTrue(result[0][3] != result[9][3])

    def test_tc_on_driver(self):
        """Verify that getting the TaskContext on the driver returns None."""
        tc = TaskContext.get()
        self.assertTrue(tc is None)

    def test_get_local_property(self):
        """Verify that local properties set on the driver are available in TaskContext."""
        key = "testkey"
        value = "testvalue"
        self.sc.setLocalProperty(key, value)
        try:
            rdd = self.sc.parallelize(range(1), 1)
            prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
            self.assertEqual(prop1, value)
            prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
            self.assertTrue(prop2 is None)
        finally:
            self.sc.setLocalProperty(key, None)

    def test_barrier(self):
        """
        Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks
        within a stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return time.time()

        times = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        self.assertTrue(max(times) - min(times) < 1)

    def test_all_gather(self):
        """
        Verify that BarrierTaskContext.allGather() performs global sync among all barrier tasks
        within a stage and passes messages properly.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            out = tc.allGather(str(tc.partitionId()))
            pids = [int(e) for e in out]
            return pids

        pids = rdd.barrier().mapPartitions(f).map(context_barrier).collect()[0]
        self.assertEqual(pids, [0, 1, 2, 3])

    def test_barrier_infos(self):
        """
        Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the
        barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        taskInfos = rdd.barrier().mapPartitions(f).map(
            lambda x: BarrierTaskContext.get().getTaskInfos()).collect()
        self.assertTrue(len(taskInfos) == 4)
        self.assertTrue(len(taskInfos[0]) == 4)

    def test_context_get(self):
        """
        Verify that TaskContext.get() works both inside and outside a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            taskContext = TaskContext.get()
            if isinstance(taskContext, BarrierTaskContext):
                yield taskContext.partitionId() + 1
            elif isinstance(taskContext, TaskContext):
                yield taskContext.partitionId() + 2
            else:
                yield -1

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [2, 3, 4, 5])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [1, 2, 3, 4])

    def test_barrier_context_get(self):
        """
        Verify that BarrierTaskContext.get() only works in a barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            try:
                taskContext = BarrierTaskContext.get()
            except Exception:
                yield -1
            else:
                yield taskContext.partitionId()

        # for normal stage
        result1 = rdd.mapPartitions(f).collect()
        self.assertTrue(result1 == [-1, -1, -1, -1])
        # for barrier stage
        result2 = rdd.barrier().mapPartitions(f).collect()
        self.assertTrue(result2 == [0, 1, 2, 3])
class TaskContextTests(PySparkTestCase):

    def setUp(self):
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        # Allow retries even though they are normally disabled in local mode
        self.sc = SparkContext('local[4, 2]', class_name)

    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)

    def test_partition_id(self):
        """Test the partition id."""
        rdd1 = self.sc.parallelize(range(10), 1)
        rdd2 = self.sc.parallelize(range(10), 2)
        pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
        pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
        self.assertEqual(0, pids1[0])
        self.assertEqual(0, pids1[9])
        self.assertEqual(0, pids2[0])
        self.assertEqual(1, pids2[9])

    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))

        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]

        result = rdd.map(fail_on_first).collect()
        # The first partition should have been re-submitted (attempt 1); all
        # other partitions should still be attempt 0.
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        first_partition = [x for x in result if x[1] == 0]
        for x in first_partition:
            self.assertEqual(1, x[2])
        other_partitions = [x for x in result if x[1] != 0]
        for x in other_partitions:
            self.assertEqual(0, x[2])
        # The task attempt ids should be different
        self.assertTrue(result[0][3] != result[9][3])

    def test_tc_on_driver(self):
        """Verify that getting the TaskContext on the driver returns None."""
        tc = TaskContext.get()
        self.assertTrue(tc is None)

    def test_get_local_property(self):
        """Verify that local properties set on the driver are available in TaskContext."""
        key = "testkey"
        value = "testvalue"
        self.sc.setLocalProperty(key, value)
        try:
            rdd = self.sc.parallelize(range(1), 1)
            prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
            self.assertEqual(prop1, value)
            prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
            self.assertTrue(prop2 is None)
        finally:
            self.sc.setLocalProperty(key, None)

    def test_barrier(self):
        """
        Verify that BarrierTaskContext.barrier() performs global sync among all barrier tasks
        within a stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        def context_barrier(x):
            tc = BarrierTaskContext.get()
            time.sleep(random.randint(1, 10))
            tc.barrier()
            return time.time()

        times = rdd.barrier().mapPartitions(f).map(context_barrier).collect()
        self.assertTrue(max(times) - min(times) < 1)

    def test_barrier_with_python_worker_reuse(self):
        """
        Verify that BarrierTaskContext.barrier() works with a reused Python worker.
        """
        self.sc._conf.set("spark.python.worker.reuse", "true")
        rdd = self.sc.parallelize(range(4), 4)
        # Run a normal job first so that all Python workers are started
        result = rdd.map(lambda x: x ** 2).collect()
        self.assertEqual([0, 1, 4, 9], result)
        # make sure `spark.python.worker.reuse=true`
        self.assertEqual(self.sc._conf.get("spark.python.worker.reuse"), "true")
        # the workers will be reused in this barrier job
        self.test_barrier()

    def test_barrier_infos(self):
        """
        Verify that BarrierTaskContext.getTaskInfos() returns a list of all task infos in the
        barrier stage.
        """
        rdd = self.sc.parallelize(range(10), 4)

        def f(iterator):
            yield sum(iterator)

        taskInfos = rdd.barrier().mapPartitions(f).map(
            lambda x: BarrierTaskContext.get().getTaskInfos()).collect()
        self.assertTrue(len(taskInfos) == 4)
        self.assertTrue(len(taskInfos[0]) == 4)
import numpy as np
import pyopencl as cl

from pyspark import SparkConf, SparkContext


# NOTE: the function header below is an assumption; the original fragment
# begins inside one of the map stages (map1/map2) used in __main__ and relies
# on ctx, queue and KERNEL_CODE being defined elsewhere.
def map2(data):
    prg = cl.Program(ctx, KERNEL_CODE).build()
    kernel = prg.Mul2
    mf = cl.mem_flags
    print("map" + str(data))
    np_data = []
    data_buf = []
    np_data.append(np.array(data[0]).astype(np.float32))
    data_buf.append(cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_data[0]))
    result = np.zeros((5,)).astype(np.float32)
    result_buf = cl.Buffer(ctx, mf.WRITE_ONLY, result.nbytes)
    kernel(queue, (5,), None, data_buf[0], result_buf)
    cl.enqueue_read_buffer(queue, result_buf, result).wait()
    return [result.astype(np.float32)]


if __name__ == "__main__":
    try:
        conf = SparkConf().setAppName("MyApp").setMaster("spark://10.20.21.194:7077")
        conf.set("spark.scheduler.allocation.file", "conf/fairscheduler.xml")
        sc = SparkContext(conf=conf)
        sc.setLocalProperty("spark.scheduler.pool", "pool1")
        distData = sc.parallelize([[[[1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]]]).map(map1).map(map2)
        result = distData.collect()
        for r in result:
            print(r[0].tolist())
    except:
        raise
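# A self-contained sketch of the pieces the snippet above leaves undefined
# (KERNEL_CODE, ctx, queue and map1). Everything here is an assumption: the
# Mul2 kernel body, the use of create_some_context(), and treating map1 as a
# pass-through stage are guesses, not the original implementation.
import pyopencl as cl

KERNEL_CODE = """
__kernel void Mul2(__global const float *in_data, __global float *out_data) {
    int gid = get_global_id(0);
    out_data[gid] = in_data[gid] * 2.0f;
}
"""

# Create an OpenCL context and command queue on whatever device is available.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)


def map1(data):
    # Hypothetical first stage: forward the record to map2 unchanged.
    return data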
from pyspark import SparkContext


def init_spark_context():
    # appConfig comes from a project-specific `conf.Config` wrapper around SparkConf.
    appConfig = conf.Config(exec_cores=8,
                            cores_max=24,
                            yarn_cores=2,
                            instances=6,
                            queue='root.world-bdp-srm-analysts')
    sc = SparkContext(conf=appConfig.setSparkConf())
    # set resource pool
    sc.setLocalProperty("spark.scheduler.pool", "default")
    return sc
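# A minimal, hypothetical usage sketch: it only assumes the init_spark_context()
# helper above and a "default" pool in the fair-scheduler configuration; the
# workload itself is arbitrary.
sc = init_spark_context()
total = sc.parallelize(range(100), 4).sum()
print(total)
sc.stop()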