def test_partition_id(self):
    """Check the partition id reported inside tasks for 1- and 2-partition RDDs."""

    def current_partition(_):
        return TaskContext.get().partitionId()

    single = self.sc.parallelize(range(10), 1)
    double = self.sc.parallelize(range(10), 2)
    single_ids = single.map(current_partition).collect()
    double_ids = double.map(current_partition).collect()

    # (expected partition id, observed partition id) for the first and last element
    checks = (
        (0, single_ids[0]),
        (0, single_ids[9]),
        (0, double_ids[0]),
        (1, double_ids[9]),
    )
    for expected, observed in checks:
        self.assertEqual(expected, observed)
def test_stage_id(self):
    """Check that stage ids are exposed and grow by one per submitted job."""

    def stage_via_get(_):
        return TaskContext.get().stageId()

    def stage_via_constructor(_):
        # Use the constructor directly rather than get()
        return TaskContext().stageId()

    rdd = self.sc.parallelize(range(10))
    first = rdd.map(stage_via_get).take(1)[0]
    second = rdd.map(stage_via_get).take(1)[0]
    third = rdd.map(stage_via_constructor).take(1)[0]

    self.assertEqual(first + 1, second)
    self.assertEqual(first + 2, third)
    self.assertEqual(second + 1, third)
def test_get_local_property(self):
    """Verify that local properties set on the driver are available in TaskContext.

    A property set through ``self.sc.setLocalProperty`` must be readable from
    inside a task, and a key that was never set must come back as ``None``.
    """
    key = "testkey"
    value = "testvalue"
    self.sc.setLocalProperty(key, value)
    try:
        rdd = self.sc.parallelize(range(1), 1)
        prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
        self.assertEqual(prop1, value)
        prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
        # assertIsNone reports the unexpected value on failure, unlike assertTrue(x is None).
        self.assertIsNone(prop2)
    finally:
        # Reset the property to None whether or not the assertions passed.
        self.sc.setLocalProperty(key, None)
def fn(rows):
    """Yield one ``Row`` per input row with an added ``'Sales'`` prediction field.

    Configures a TensorFlow session (GPU or single-threaded CPU depending on
    ``GPU_INFERENCE_ENABLED``), restores the model from ``model_bytes`` and
    runs it row by row over ``rows``.
    """
    import math
    import tensorflow as tf
    import tensorflow.keras.backend as K

    if not GPU_INFERENCE_ENABLED:
        # Do not use GPUs for prediction, use single CPU core per task.
        config = tf.ConfigProto(device_count={'GPU': 0})
        config.inter_op_parallelism_threads = 1
        config.intra_op_parallelism_threads = 1
    else:
        from pyspark import TaskContext

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # Restrict the session to the first GPU address listed for this task.
        gpu_addresses = TaskContext.get().resources()['gpu'].addresses
        config.gpu_options.visible_device_list = gpu_addresses[0]
    K.set_session(tf.Session(config=config))

    # Restore from checkpoint.
    model = deserialize_model(model_bytes, tf.keras.models.load_model)

    # Perform predictions.
    for row in rows:
        fields = row.asDict().copy()
        features = [[row[col]] for col in all_cols]
        log_sales = model.predict_on_batch(features)[0]
        # Convert from log domain to real Sales numbers and add the 'Sales' column.
        fields['Sales'] = math.exp(log_sales)
        yield Row(**fields)
def test_attempt_number(self):
    """Verify the attempt numbers are correctly reported.

    First runs a job with no failures and expects every attempt number to be
    0. Then runs a job whose tasks for partition 0 fail on their first
    attempt, and expects attempt number 1 for that partition and 0 for all
    other partitions.
    """
    rdd = self.sc.parallelize(range(10))
    # Verify a simple job with no failures
    attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
    # Explicit loop: on Python 3 a bare map(...) is lazy, so assertions
    # wrapped in it would never execute.
    for attempt in attempt_numbers:
        self.assertEqual(0, attempt)

    def fail_on_first(x):
        """Fail on the first attempt so we get a positive attempt number"""
        tc = TaskContext.get()
        attempt_number = tc.attemptNumber()
        partition_id = tc.partitionId()
        attempt_id = tc.taskAttemptId()
        if attempt_number == 0 and partition_id == 0:
            raise Exception("Failing on first attempt")
        else:
            return [x, partition_id, attempt_number, attempt_id]

    result = rdd.map(fail_on_first).collect()
    # We should re-submit the first partition to it but other partitions should be attempt 0
    self.assertEqual([0, 0, 1], result[0][0:3])
    self.assertEqual([9, 3, 0], result[9][0:3])
    # Each row is [x, partition_id, attempt_number, attempt_id].
    first_partition = [row for row in result if row[1] == 0]
    for row in first_partition:
        self.assertEqual(1, row[2])
    other_partitions = [row for row in result if row[1] != 0]
    for row in other_partitions:
        self.assertEqual(0, row[2])
    # The task attempt id should be different
    self.assertNotEqual(result[0][3], result[9][3])
def _process_partition(messages):
    """Aggregate sales per store for one partition and upsert them with the offset.

    Args:
        messages: iterable of ``(key, message)`` pairs; each message is indexed
            by ``'items'`` (entries carrying ``'total_price_paid'``) and
            ``'store_id'``.

    Returns:
        A single-element list holding the number of distinct stores written.
    """
    offset = offsets[TaskContext.get().partitionId()]
    totals = defaultdict(float)
    for (_, message) in messages:
        price = sum(item['total_price_paid'] for item in message['items'])
        totals[message['store_id']] += price
    engine = create_engine(url)
    # avoid transactional deadlock: always write stores in the same (sorted) order.
    # .items() works on both Python 2 and 3, whereas .iteritems() is Python 2 only.
    ordered_totals = sorted(totals.items())
    # Sales rows and the consumed offset are written inside one transaction.
    with engine.begin() as conn:
        for store_id, price in ordered_totals:
            conn.execute(text(SALES_UPSERT_QUERY), store_id=store_id,
                         date=timestr, total_sales_price=price)
        # NOTE: OFFSET_UPSERT_QEURY (sic) is the name defined outside this block.
        conn.execute(text(OFFSET_UPSERT_QEURY), topic=offset.topic,
                     partition=offset.partition, offset=offset.untilOffset)
    return [len(ordered_totals)]
def context(iterator):
    """Yield ``(task partition id, barrier partition id or -1, worker pid)`` once."""
    task_partition = TaskContext.get().partitionId()
    try:
        barrier_partition = BarrierTaskContext.get().partitionId()
    except Exception:
        # Fall back to -1 when the barrier context lookup fails.
        barrier_partition = -1
    pid = os.getpid()
    yield (task_partition, barrier_partition, pid)
def f(iterator):
    """Yield a single marker value that reveals the type of the active context.

    ``partitionId() + 1`` for a ``BarrierTaskContext``, ``partitionId() + 2``
    for any other ``TaskContext``, and ``-1`` otherwise.
    """
    ctx = TaskContext.get()
    # The barrier check must come first so it is not shadowed by the
    # more general TaskContext check.
    if isinstance(ctx, BarrierTaskContext):
        bump = 1
    elif isinstance(ctx, TaskContext):
        bump = 2
    else:
        yield -1
        return
    yield ctx.partitionId() + bump
def _get_property_from_spark_context(key): try: from pyspark import TaskContext # pylint: disable=import-error task_context = TaskContext.get() if task_context: return task_context.getLocalProperty(key) except Exception: # pylint: disable=broad-except return None
def test_resources(self):
    """Check that tasks see no resources by default."""

    def resources_via_get(_):
        return TaskContext.get().resources()

    def resources_via_constructor(_):
        # Use the constructor directly rather than get()
        return TaskContext().resources()

    rdd = self.sc.parallelize(range(10))
    from_get = rdd.map(resources_via_get).take(1)[0]
    from_constructor = rdd.map(resources_via_constructor).take(1)[0]

    self.assertEqual(len(from_get), 0)
    self.assertEqual(len(from_constructor), 0)
def test_resources(self):
    """Test the resources are available.

    Expects exactly one resource, named ``'gpu'``, with the single address ``'0'``.
    """
    rdd = self.sc.parallelize(range(10))
    resources = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
    self.assertEqual(len(resources), 1)
    # assertIn shows both the key and the container when the check fails.
    self.assertIn('gpu', resources)
    self.assertEqual(resources['gpu'].name, 'gpu')
    self.assertEqual(resources['gpu'].addresses, ['0'])
def fail_on_first(x):
    """Raise on the first attempt of partition 0 so a retry yields a positive attempt number.

    Otherwise returns ``[x, partition id, attempt number, task attempt id]``.
    """
    ctx = TaskContext.get()
    attempt = ctx.attemptNumber()
    partition = ctx.partitionId()
    task_attempt = ctx.taskAttemptId()
    if partition == 0 and attempt == 0:
        raise Exception("Failing on first attempt")
    return [x, partition, attempt, task_attempt]
def get_partition_attempt_id():
    """Return the partition id and attempt number of the current task context.

    Intended to be invoked on a Spark executor. The partition id identifies
    the RDD partition computed by this task; the first task attempt has
    attempt number 0 and later attempts have increasing attempt numbers.

    Returns:
        A ``(partitionId, attemptNumber)`` tuple.
    """
    ctx = TaskContext.get()
    partition_id = ctx.partitionId()
    attempt_number = ctx.attemptNumber()
    return partition_id, attempt_number
def _transform_to_slices(rdds):
    """Split one partition's rows into consecutive DataFrame slices.

    The rows are loaded into a DataFrame with ``CSV_ORDERED_COLUMNS`` and cut
    into slices of ``batch_size`` rows (1 row when ``batch_size`` is None).
    The final slice may be shorter.

    Args:
        rdds: iterable of rows belonging to the current partition.

    Returns:
        A list of ``(slice, number_of_rows_in_slice)`` tuples; empty when the
        partition has no rows.
    """
    taskcontext = TaskContext.get()
    partitionid = taskcontext.partitionId()
    csv = pd.DataFrame(list(rdds), columns=CSV_ORDERED_COLUMNS)
    num_rows = len(csv.index)
    print("working with partition: ", partitionid, max_partition_num, num_rows)

    # Resolve the step once so the None fallback is applied everywhere;
    # previously the bounds check used batch_size directly and raised a
    # TypeError when it was None.
    step = batch_size if batch_size is not None else 1

    examples = []
    for start_ind in range(0, num_rows, step):
        # iloc clips at the end of the frame, so this also covers a short final batch.
        csv_slice = csv.iloc[start_ind:start_ind + step]
        if start_ind + step > num_rows:
            # Partial final batch (necessarily the last iteration).
            print("last Example has: ", len(csv_slice), partitionid)
        examples.append((csv_slice, len(csv_slice)))
    return examples
def test_cpus(self):
    """Check that the number of cpus reported inside a task is 2."""

    def task_cpus(_):
        return TaskContext.get().cpus()

    rdd = self.sc.parallelize(range(10))
    observed = rdd.map(task_cpus).take(1)[0]
    self.assertEqual(observed, 2)
def test_tc_on_driver(self):
    """Verify that getting the TaskContext on the driver returns None."""
    tc = TaskContext.get()
    # assertIsNone reports the unexpected object on failure, unlike assertTrue(tc is None).
    self.assertIsNone(tc)
def _get_gpus(cluster_spec=None):
    """Select the GPUs for this task and export them via ``CUDA_VISIBLE_DEVICES``.

    Selection order:
      1. If the Spark resource API is available and the task context lists
         ``'gpu'`` resources, use those addresses (truncated to
         ``tf_args.num_gpus`` when that was supplied and is smaller).
      2. Otherwise, when not running in a K8s pod and ``gpu_info`` reports a
         GPU, allocate through ``gpu_info.get_gpus`` (defaulting to one GPU
         when nothing was requested explicitly).

    Args:
        cluster_spec: optional mapping of job name to a list of ``host:port``
            addresses; used to compute this task's index among the peers on
            the same host.

    Raises:
        Exception: when ``tf_args.num_gpus`` was supplied and fewer GPUs
            could be obtained.
    """
    gpus = []
    is_k8s = 'SPARK_EXECUTOR_POD_IP' in os.environ

    # handle explicitly configured tf_args.num_gpus
    if 'num_gpus' in tf_args:
        requested_gpus = tf_args.num_gpus
        user_requested = True
    else:
        requested_gpus = 0
        user_requested = False

    # first, try Spark 3 resources API, returning all visible GPUs
    # note: num_gpus arg is only used (if supplied) to limit/truncate visible devices
    if _has_spark_resource_api():
        from pyspark import TaskContext

        context = TaskContext.get()
        if context:
            resources = context.resources()
            if resources and 'gpu' in resources:
                # get all GPUs assigned by resource manager
                gpus = resources['gpu'].addresses
                logger.info("Spark gpu resources: {}".format(gpus))
                if user_requested:
                    if requested_gpus < len(gpus):
                        # override/truncate list, if explicitly configured
                        # (Logger.warn is a deprecated alias of Logger.warning)
                        logger.warning(
                            "Requested {} GPU(s), but {} available".format(
                                requested_gpus, len(gpus)))
                        gpus = gpus[:requested_gpus]
                else:
                    # implicitly requested by Spark 3
                    requested_gpus = len(gpus)

    # if not in K8s pod and GPUs available, just use original allocation code
    # (defaulting to 1 GPU if available)
    # Note: for K8s, there is a bug with the Nvidia device_plugin which can show GPUs
    # for non-GPU pods that are hosted on GPU nodes
    if not is_k8s and gpu_info.is_gpu_available() and not gpus:
        # default to one GPU if not specified explicitly
        if not user_requested:
            requested_gpus = max(1, requested_gpus)
        if requested_gpus > 0:
            if cluster_spec:
                # compute my index relative to other nodes on the same host (for GPU allocation)
                my_addr = cluster_spec[job_name][task_index]
                my_host = my_addr.split(':')[0]
                flattened = [v for sublist in cluster_spec.values() for v in sublist]
                # Compare the exact host part: a plain prefix match would also
                # count peers on hosts such as "10.0.0.10" for host "10.0.0.1".
                local_peers = [p for p in flattened if p.split(':')[0] == my_host]
                my_index = local_peers.index(my_addr)
            else:
                my_index = 0

            # try to allocate a GPU
            gpus = gpu_info.get_gpus(requested_gpus, my_index, format=gpu_info.AS_LIST)

    if user_requested and len(gpus) < requested_gpus:
        raise Exception(
            "Unable to allocate {} GPU(s) from available GPUs: {}".format(
                requested_gpus, gpus))

    gpus_to_use = ','.join(gpus)
    if gpus:
        logger.info(
            "Requested {} GPU(s), setting CUDA_VISIBLE_DEVICES={}".format(
                requested_gpus if user_requested else len(gpus), gpus_to_use))
    # Always exported, even when empty (empty string when no GPU was selected).
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use
def _get_resources(self):
    """Return the current task's resources, or an empty dict for pyspark older than 3.0.0."""
    if LooseVersion(pyspark.__version__) < LooseVersion('3.0.0'):
        # No resources are looked up for pyspark versions below 3.0.0.
        return {}
    from pyspark import TaskContext

    return TaskContext.get().resources()
def _get_spark_task_context_or_none(): try: from pyspark import TaskContext # pylint: disable=import-error return TaskContext.get() except ImportError: return None
print("My custom profiles for RDD:%s" % id) conf = SparkConf().set("spark.python.profile", "true") sc = SparkContext('local', 'test', conf=conf, profiler_cls=MyCustomProfiler) sc.parallelize(range(1000)).map(lambda x: 2 * x).take(10) sc.parallelize(range(1000)).count() sc.show_profiles() # My custom profiles for RDD:1 # My custom profiles for RDD:3 sc.stop() print( "-----TaskContext-----RDDBarrier----BarrierTaskContext----BarrierTaskInfo--------------" ) tc = TaskContext.get() # 返回当前活动的TaskContext。可以在用户函数内部调用它,以访问有关正在运行的任务的上下文信息。 if tc: print(tc.attemptNumber()) print(tc.getLocalProperty("key")) print(tc.partitionId()) print(tc.resources()) print(tc.stageId()) print(tc.taskAttemptId()) # RDDBarrier(实验性的) 用屏障包装RDD以实现屏障执行。 屏障调度器barrier scheduling # Spark 为了支持深度学习而引入的屏障调度器 b = rdd.barrier() # b = RDDBarrier(rdd) # b.mapPartitions() # b.mapPartitionsWithIndex() # bt = BarrierTaskContext.get()