Example #1
import socket

from pyspark import TaskContext


def task_info(*_):
    ctx = TaskContext()
    return [
        "Stage: {0}, Partition: {1}, Host: {2}".format(ctx.stageId(),
                                                       ctx.partitionId(),
                                                       socket.gethostname())
    ]
def process_spark_partitions(partition):
    """

    :param partition:
    :type partition:
    :return:
    :rtype:
    """
    ctx = TaskContext()
    logger.info("start_processing_partition partitionId=" + str(ctx.partitionId()))

    big_taxo = TaxonomyWrapper.get(args, SERVICE_PRINCIPAL_SECRET, logger)
    gensim_model = GensimMagic.get(args, SERVICE_PRINCIPAL_SECRET, logger)  # move this to process_partitions
    de_vocab = gensim_model["vocab"]  # move this to process_partitions
    de_model = gensim_model["model"]  # move this to process_partitions

    words_list = set(de_vocab.keys())
    for domain, domain_dict in big_taxo.items():
        words_list = words_list.union(set(domain_dict.keys()))

    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry, big_taxo, de_model, de_vocab, words_list))
    logger.info(f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records")
    return all_records
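
Both functions above are written as mapPartitions callbacks: they receive an iterator over one partition's records and return a list. A minimal driver-side sketch of how they could be wired up, assuming a SparkContext and an input path that are not part of the original snippet:

from pyspark import SparkContext

sc = SparkContext(appName="partition_demo")   # app name is an assumption
rdd = sc.textFile("hdfs:///data/input.txt")   # hypothetical input path

# Log which stage, partition and host each task runs on
print(rdd.mapPartitions(task_info).collect())

# Run the per-partition processing defined above
processed = rdd.mapPartitions(process_spark_partitions).collect()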
Example #3
 def test_partition_id(self):
     """Test the partition id."""
     rdd1 = self.sc.parallelize(range(10), 1)
     rdd2 = self.sc.parallelize(range(10), 2)
     pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
     pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
     self.assertEqual(0, pids1[0])
     self.assertEqual(0, pids1[9])
     self.assertEqual(0, pids2[0])
     self.assertEqual(1, pids2[9])
Example #4
 def test_stage_id(self):
     """Test the stage ids are available and incrementing as expected."""
     rdd = self.sc.parallelize(range(10))
     stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
     stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
     # Test using the constructor directly rather than the get()
     stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
     self.assertEqual(stage1 + 1, stage2)
     self.assertEqual(stage1 + 2, stage3)
     self.assertEqual(stage2 + 1, stage3)
Example #5
 def test_get_local_property(self):
     """Verify that local properties set on the driver are available in TaskContext."""
     key = "testkey"
     value = "testvalue"
     self.sc.setLocalProperty(key, value)
     try:
         rdd = self.sc.parallelize(range(1), 1)
         prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
         self.assertEqual(prop1, value)
         prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
         self.assertTrue(prop2 is None)
     finally:
         self.sc.setLocalProperty(key, None)
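
Beyond the test above, getLocalProperty is a handy way to pass small driver-side settings to tasks without baking them into closures. A minimal sketch, assuming a hypothetical output_format property that the original code does not define:

from pyspark import SparkContext, TaskContext

sc = SparkContext(appName="local_property_demo")   # app name is an assumption
sc.setLocalProperty("output_format", "parquet")    # hypothetical property

def read_property(partition):
    # Runs inside each task and reads the property set on the driver above
    yield TaskContext.get().getLocalProperty("output_format")

print(sc.parallelize(range(2), 2).mapPartitions(read_property).collect())
# should print ['parquet', 'parquet'], one value per partition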
Example #6
    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        # map() is lazy in Python 3, so iterate explicitly to run the assertions
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]
        result = rdd.map(fail_on_first).collect()
        # The first partition should have been re-submitted (attempt 1), while the other partitions should still be attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        # filter()/map() are lazy in Python 3, so materialize and loop explicitly
        first_partition = [x for x in result if x[1] == 0]
        for x in first_partition:
            self.assertEqual(1, x[2])
        other_partitions = [x for x in result if x[1] != 0]
        for x in other_partitions:
            self.assertEqual(0, x[2])
        # The task attempt id should be different
        self.assertTrue(result[0][3] != result[9][3])
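
attemptNumber() and taskAttemptId(), exercised above, are mainly useful for making task side effects safe under retries. A small illustrative sketch, not taken from the original test, that puts the attempt id into an output file name so a retried task never overwrites whatever a failed attempt left behind (write_partition_output and out_dir are hypothetical):

import os
from pyspark import TaskContext

def write_partition_output(rows, out_dir="/tmp/out"):   # out_dir is an assumption
    tc = TaskContext.get()
    # taskAttemptId() differs between attempts, so retries get fresh file names
    path = os.path.join(out_dir, f"part_{tc.partitionId()}_attempt_{tc.taskAttemptId()}.txt")
    os.makedirs(out_dir, exist_ok=True)
    with open(path, "w") as fh:
        for row in rows:
            fh.write(f"{row}\n")
    yield path

# paths = rdd.mapPartitions(write_partition_output).collect()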
Example #7
def process_spark_partitions(partition):
    """
    Process one Spark partition and return the processed records.

    :param partition: iterator over the records of a single partition
    :type partition: iterator
    :return: processed records for this partition
    :rtype: list
    """
    ctx = TaskContext()
    logger.info("start_processing_partitionId=" + str(ctx.partitionId()))
    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry))
    logger.info(
        f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records"
    )
    return all_records
Example #8
 def fail_on_first(x):
     """Fail on the first attempt so we get a positive attempt number"""
     tc = TaskContext.get()
     attempt_number = tc.attemptNumber()
     partition_id = tc.partitionId()
     attempt_id = tc.taskAttemptId()
     if attempt_number == 0 and partition_id == 0:
         raise Exception("Failing on first attempt")
     else:
         return [x, partition_id, attempt_number, attempt_id]
Example #9
def save_spark_pandas_to_parquet(output, out_dir):
    from pyspark import TaskContext

    ctx = TaskContext()
    name = f"part_{ctx.partitionId()}"
    # print("Stage: {0}, Partition: {1}, Host: {2}".format(
    #     ctx.stageId(), ctx.partitionId(), socket.gethostname()))

    for ds in output.dataset.unique():
        df = output[output.dataset == ds]
        if df.shape[0] == 0:
            return
        mkdir(f"{out_dir}/{ds}")
        path = f"{out_dir}/{ds}/{name}.parquet"
        df.to_parquet(path=path)
        print(f"Saved to {path}")
Example #10
 def test_tc_on_driver(self):
     """Verify that getting the TaskContext on the driver returns None."""
     tc = TaskContext.get()
     self.assertTrue(tc is None)
Example #11
from pyspark import SparkContext
from pyspark import TaskContext

if __name__ == '__main__':
    sc = SparkContext()
    tc = TaskContext()

    rdd = sc.parallelize(["这", "是", "一", "首", "简", "单", "的", "小", "情", "歌"], 3)

    # Similar to map, but map works on each element while mapPartitions works on each partition
    # The function passed to mapPartitions should take an iterator and return an iterator
    def f(iter):
        yield "".join(iter) + str(tc.partitionId())

    mapPartitions_rdd = rdd.mapPartitions(f)

    print(mapPartitions_rdd.collect())  # ['这是一0', '首简单1', '的小情歌2']
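
An equivalent variant looks the context up inside the task function instead of capturing the driver-created tc in the closure; within a running task TaskContext.get() returns the per-task singleton, so the partition ids come out the same. Appended inside the same __main__ block above, with f_get as an illustrative name:

    def f_get(it):
        # Fetch the context inside the task rather than closing over tc
        yield "".join(it) + str(TaskContext.get().partitionId())

    print(rdd.mapPartitions(f_get).collect())  # same output as above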