Example #1
import socket

from pyspark import TaskContext


def task_info(*_):
    ctx = TaskContext()
    return [
        "Stage: {0}, Partition: {1}, Host: {2}".format(ctx.stageId(),
                                                       ctx.partitionId(),
                                                       socket.gethostname())
    ]
def process_spark_partitions(partition):
    """

    :param partition:
    :type partition:
    :return:
    :rtype:
    """
    ctx = TaskContext()
    logger.info("start_processing_partition partitionId=" + str(ctx.partitionId()))

    big_taxo = TaxonomyWrapper.get(args, SERVICE_PRINCIPAL_SECRET, logger)
    gensim_model = GensimMagic.get(args, SERVICE_PRINCIPAL_SECRET, logger)  # move this to process_partitions
    de_vocab = gensim_model["vocab"]  # move this to process_partitions
    de_model = gensim_model["model"]  # move this to process_partitions

    words_list = set(de_vocab.keys())
    for domain, domain_dict in big_taxo.items():
        words_list = words_list.union(set(domain_dict.keys()))

    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry, big_taxo, de_model, de_vocab, words_list))
    logger.info(f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records")
    return all_records
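
Both functions above are written as mapPartitions callbacks: they receive an iterator over one partition's records and return a list. A minimal driver-side sketch of how they could be wired up, assuming a SparkContext and an input path that are not part of the original snippet:

from pyspark import SparkContext

sc = SparkContext(appName="partition_demo")   # app name is an assumption
rdd = sc.textFile("hdfs:///data/input.txt")   # hypothetical input path

# Log which stage, partition and host each task runs on
print(rdd.mapPartitions(task_info).collect())

# Run the per-partition processing defined above
processed = rdd.mapPartitions(process_spark_partitions).collect()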
Example #3
 def test_partition_id(self):
     """Test the partition id."""
     rdd1 = self.sc.parallelize(range(10), 1)
     rdd2 = self.sc.parallelize(range(10), 2)
     pids1 = rdd1.map(lambda x: TaskContext.get().partitionId()).collect()
     pids2 = rdd2.map(lambda x: TaskContext.get().partitionId()).collect()
     self.assertEqual(0, pids1[0])
     self.assertEqual(0, pids1[9])
     self.assertEqual(0, pids2[0])
     self.assertEqual(1, pids2[9])
Example #4
 def test_stage_id(self):
     """Test the stage ids are available and incrementing as expected."""
     rdd = self.sc.parallelize(range(10))
     stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
     stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
     # Test using the constructor directly rather than the get()
     stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
     self.assertEqual(stage1 + 1, stage2)
     self.assertEqual(stage1 + 2, stage3)
     self.assertEqual(stage2 + 1, stage3)
Example #5
 def test_get_local_property(self):
     """Verify that local properties set on the driver are available in TaskContext."""
     key = "testkey"
     value = "testvalue"
     self.sc.setLocalProperty(key, value)
     try:
         rdd = self.sc.parallelize(range(1), 1)
         prop1 = rdd.map(lambda _: TaskContext.get().getLocalProperty(key)).collect()[0]
         self.assertEqual(prop1, value)
         prop2 = rdd.map(lambda _: TaskContext.get().getLocalProperty("otherkey")).collect()[0]
         self.assertTrue(prop2 is None)
     finally:
         self.sc.setLocalProperty(key, None)
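
Beyond the test above, getLocalProperty is a handy way to pass small driver-side settings to tasks without baking them into closures. A minimal sketch, assuming a hypothetical output_format property that the original code does not define:

from pyspark import SparkContext, TaskContext

sc = SparkContext(appName="local_property_demo")   # app name is an assumption
sc.setLocalProperty("output_format", "parquet")    # hypothetical property

def read_property(partition):
    # Runs inside each task and reads the property set on the driver above
    yield TaskContext.get().getLocalProperty("output_format")

print(sc.parallelize(range(2), 2).mapPartitions(read_property).collect())
# should print ['parquet', 'parquet'], one value per partition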
Example #6
    def test_attempt_number(self):
        """Verify the attempt numbers are correctly reported."""
        rdd = self.sc.parallelize(range(10))
        # Verify a simple job with no failures
        attempt_numbers = rdd.map(lambda x: TaskContext.get().attemptNumber()).collect()
        # map() is lazy in Python 3, so iterate explicitly to run the assertions
        for attempt in attempt_numbers:
            self.assertEqual(0, attempt)

        def fail_on_first(x):
            """Fail on the first attempt so we get a positive attempt number"""
            tc = TaskContext.get()
            attempt_number = tc.attemptNumber()
            partition_id = tc.partitionId()
            attempt_id = tc.taskAttemptId()
            if attempt_number == 0 and partition_id == 0:
                raise Exception("Failing on first attempt")
            else:
                return [x, partition_id, attempt_number, attempt_id]
        result = rdd.map(fail_on_first).collect()
        # The first partition should have been re-submitted (attempt 1), while the other partitions should still be attempt 0
        self.assertEqual([0, 0, 1], result[0][0:3])
        self.assertEqual([9, 3, 0], result[9][0:3])
        # filter()/map() are lazy in Python 3, so materialize and loop explicitly
        first_partition = [x for x in result if x[1] == 0]
        for x in first_partition:
            self.assertEqual(1, x[2])
        other_partitions = [x for x in result if x[1] != 0]
        for x in other_partitions:
            self.assertEqual(0, x[2])
        # The task attempt id should be different
        self.assertTrue(result[0][3] != result[9][3])
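
attemptNumber() and taskAttemptId(), exercised above, are mainly useful for making task side effects safe under retries. A small illustrative sketch, not taken from the original test, that puts the attempt id into an output file name so a retried task never overwrites whatever a failed attempt left behind (write_partition_output and out_dir are hypothetical):

import os
from pyspark import TaskContext

def write_partition_output(rows, out_dir="/tmp/out"):   # out_dir is an assumption
    tc = TaskContext.get()
    # taskAttemptId() differs between attempts, so retries get fresh file names
    path = os.path.join(out_dir, f"part_{tc.partitionId()}_attempt_{tc.taskAttemptId()}.txt")
    os.makedirs(out_dir, exist_ok=True)
    with open(path, "w") as fh:
        for row in rows:
            fh.write(f"{row}\n")
    yield path

# paths = rdd.mapPartitions(write_partition_output).collect()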
Example #7
def process_spark_partitions(partition):
    """
    Process one Spark partition and return the processed records.

    :param partition: iterator over the records of a single partition
    :type partition: iterator
    :return: processed records for this partition
    :rtype: list
    """
    ctx = TaskContext()
    logger.info("start_processing_partitionId=" + str(ctx.partitionId()))
    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry))
    logger.info(
        f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records"
    )
    return all_records
Example #8
 def fail_on_first(x):
     """Fail on the first attempt so we get a positive attempt number"""
     tc = TaskContext.get()
     attempt_number = tc.attemptNumber()
     partition_id = tc.partitionId()
     attempt_id = tc.taskAttemptId()
     if attempt_number == 0 and partition_id == 0:
         raise Exception("Failing on first attempt")
     else:
         return [x, partition_id, attempt_number, attempt_id]
Example #9
def save_spark_pandas_to_parquet(output, out_dir):
    from pyspark import TaskContext

    ctx = TaskContext()
    name = f"part_{ctx.partitionId()}"
    # print("Stage: {0}, Partition: {1}, Host: {2}".format(
    #     ctx.stageId(), ctx.partitionId(), socket.gethostname()))

    for ds in output.dataset.unique():
        df = output[output.dataset == ds]
        if df.shape[0] == 0:
            return
        mkdir(f"{out_dir}/{ds}")
        path = f"{out_dir}/{ds}/{name}.parquet"
        df.to_parquet(path=path)
        print(f"Saved to {path}")
Example #10
 def test_tc_on_driver(self):
     """Verify that getting the TaskContext on the driver returns None."""
     tc = TaskContext.get()
     self.assertTrue(tc is None)
Example #11
from pyspark import SparkContext
from pyspark import TaskContext

if __name__ == '__main__':
    sc = SparkContext()
    tc = TaskContext()

    rdd = sc.parallelize(["这", "是", "一", "首", "简", "单", "的", "小", "情", "歌"], 3)

    # Similar to map, but map works on each element while mapPartitions works on each partition
    # The function passed to mapPartitions should take an iterator and return an iterator
    def f(iter):
        yield "".join(iter) + str(tc.partitionId())

    mapPartitions_rdd = rdd.mapPartitions(f)

    print(mapPartitions_rdd.collect())  # ['这是一0', '首简单1', '的小情歌2']
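
An equivalent variant looks the context up inside the task function instead of capturing the driver-created tc in the closure; within a running task TaskContext.get() returns the per-task singleton, so the partition ids come out the same. Appended inside the same __main__ block above, with f_get as an illustrative name:

    def f_get(it):
        # Fetch the context inside the task rather than closing over tc
        yield "".join(it) + str(TaskContext.get().partitionId())

    print(rdd.mapPartitions(f_get).collect())  # same output as above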