def _union(rdd: RDD, other: RDD, func):
    # Use the larger partition count so neither input loses parallelism.
    num_partition = max(rdd.getNumPartitions(), other.getNumPartitions())

    def _func(pair):
        # `pair` holds the two cogrouped value iterables for one key.
        iter1, iter2 = pair
        val1 = list(iter1)
        val2 = list(iter2)
        if not val1:
            return val2[0]
        if not val2:
            return val1[0]
        # Key present on both sides: merge the two values with `func`.
        return func(val1[0], val2[0])

    return _map_value(rdd.cogroup(other, num_partition), _func)
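# Usage sketch for _union (illustrative, not from the original source); it
# assumes a live SparkContext `sc` and that `_map_value` above is in scope.
from operator import add

def _union_example(sc):
    left = sc.parallelize([("a", 1), ("b", 2)])
    right = sc.parallelize([("b", 10), ("c", 3)])
    # Keys on one side pass through; keys on both sides are merged with `add`.
    merged = _union(left, right, add)
    assert sorted(merged.collect()) == [("a", 1), ("b", 12), ("c", 3)]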
@classmethod
def from_rdd(cls, rdd: RDD, job_id: str, namespace: str, name: str):
    partitions = rdd.getNumPartitions()
    return RDDTable(session_id=job_id,
                    namespace=namespace,
                    name=name,
                    partitions=partitions,
                    rdd=rdd)
def _tmp_table_from_rdd(self, rdd: RDD, name=None):
    """
    Create a temporary table backed by `rdd`, with namespace == job_id.
    """
    rdd = materialize(rdd)
    name = name or str(uuid.uuid1())
    return RDDTable(session_id=self._session_id,
                    namespace=self._namespace,
                    name=name,
                    partitions=rdd.getNumPartitions(),
                    rdd=rdd,
                    dtable=None)
def _check_data(train: RDD = None, test: RDD = None) -> RDD:
    # Data-type check: validate whichever RDD is given and return it.
    if isinstance(train, RDD):
        is_legal_train = train.map(
            lambda u: len(u) >= 3 and u[0] is not None and u[1] is not None
            and isinstance(u[2], Number)).reduce(lambda u1, u2: u1 and u2)
        if not is_legal_train:
            raise ValueError(
                "Parameter train should be an RDD<(user, item, rating)>")
        return train
    if isinstance(test, RDD):
        is_legal_test = test.map(
            lambda u: len(u) >= 2 and u[0] is not None and u[1] is not None
        ).reduce(lambda u1, u2: u1 and u2)
        if not is_legal_test:
            raise ValueError(
                "Parameter test should be an RDD<(user, item)>")
        return test
    raise ValueError("Either train or test must be provided as an RDD.")
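# Hypothetical invocation of _check_data (example data only); assumes a live
# SparkContext `sc` and `from numbers import Number` as used above.
def _check_data_example(sc):
    train = sc.parallelize([("u1", "i1", 4.0), ("u2", "i2", 3.5)])
    checked = _check_data(train=train)    # valid: returned unchanged
    bad = sc.parallelize([("u1", "i1")])  # missing the numeric rating
    try:
        _check_data(train=bad)
    except ValueError as err:
        print(err)  # Parameter train should be an RDD<(user, item, rating)>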
def repar_rdd(rdd: RDD, rdd_count: int, example_per_par=100000, coalesce_only=True):
    """
    Repartition `rdd` based on the number of examples.

    If `coalesce_only` is True and the expected partition count is greater
    than the current one, the RDD is returned unchanged (coalesce can only
    reduce the number of partitions).
    """
    num_partition = rdd.getNumPartitions()
    expect_partition = max(1, int(rdd_count / example_per_par))
    if expect_partition < num_partition:
        rdd = rdd.coalesce(expect_partition)
    elif expect_partition > num_partition and coalesce_only is False:
        rdd = rdd.repartition(expect_partition)
    return rdd
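# Sketch of the partition arithmetic (sizes are illustrative): 1,000,000 rows
# at the default 100,000 examples per partition targets max(1, 10) = 10.
def repar_rdd_example(sc):
    rdd = sc.parallelize(range(1_000_000), numSlices=50)
    shrunk = repar_rdd(rdd, 1_000_000)     # 50 -> 10 partitions via coalesce
    same = repar_rdd(shrunk, 100_000_000)  # unchanged: growing needs coalesce_only=False
    grown = repar_rdd(shrunk, 100_000_000, coalesce_only=False)  # 10 -> 1000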
def __call__(self, rdd: RDD, **kwargs: Any) -> RDD:
    """
    Performs a single step of an algorithm, running all operations in
    sequence and ensuring data is partitioned correctly.

    Any additional keyword arguments passed to this function will be
    available in all life-cycle functions of the step:

    - `group`
    - `emit_by_group`
    - `broadcast`
    - `step`

    **DO NOT OVERRIDE WHEN DEFINING CUSTOM STEPS.**
    """
    if rdd.getNumPartitions() != self._n_partitions:
        rdd = rdd.repartition(self._n_partitions)
    step_cls: Type[Step] = self.__class__
    # Cache because the grouped RDD is used twice (emit and step).
    rdd = step_cls.group(rdd, **kwargs).cache()

    def unwrap_emit(kv: Tuple[Any, Iterable[Any]]) -> Optional[Tuple[Any, Any]]:
        k, v = kv
        new_v = step_cls.emit_by_group(k, v, **kwargs)
        return new_v

    emitted = list(rdd.map(unwrap_emit, preservesPartitioning=True).collect())
    to_broadcast = step_cls.broadcast(emitted, **kwargs)
    broadcast: Broadcast = self._sc.broadcast(to_broadcast)

    def unwrap_step(kv: Tuple[Any, Iterable[Any]]) -> Iterable[Any]:
        k, v = kv
        for new_v in step_cls.step(k, v, broadcast, **kwargs):
            yield new_v

    rdd = rdd.flatMap(unwrap_step, preservesPartitioning=True)
    return rdd
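# Hypothetical Step subclass; the hook signatures are inferred from __call__
# above (group / emit_by_group / broadcast / step), not taken from the
# original source. It centers every value around the global mean.
class MeanCenterStep(Step):
    @staticmethod
    def group(rdd, **kwargs):
        # (key, value) pairs -> (key, iterable of values)
        return rdd.groupByKey()

    @staticmethod
    def emit_by_group(k, v, **kwargs):
        vals = list(v)
        return sum(vals), len(vals)  # each group's local (sum, count)

    @staticmethod
    def broadcast(emitted, **kwargs):
        total = sum(s for s, _ in emitted)
        count = sum(c for _, c in emitted)
        return total / count if count else 0.0  # global mean

    @staticmethod
    def step(k, v, broadcast, **kwargs):
        mean = broadcast.value
        for x in v:
            yield k, x - mean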
def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """
    Modified from
    https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py
    """
    # Count the elements in each partition so every partition knows the
    # global index at which it starts.
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        # Emit (index, value) pairs.
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        # Append the index as a trailing column (row form).
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
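# Illustrative call (not from the original file): indices are assigned
# partition by partition, so they follow the RDD's iteration order; assumes
# a live SparkContext `sc`.
def _zipWithIndex_example(sc):
    rdd = sc.parallelize(["a", "b", "c"], numSlices=2)
    print(_zipWithIndex(rdd).collect())  # [(0, 'a'), (1, 'b'), (2, 'c')]
    rows = sc.parallelize([["x", 1], ["y", 2]], numSlices=2)
    print(_zipWithIndex(rows, to_rows=True).collect())  # [['x', 1, 0], ['y', 2, 1]]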
def _join(rdd: RDD, other: RDD, func=None):
    num_partitions = max(rdd.getNumPartitions(), other.getNumPartitions())
    rtn_rdd = rdd.join(other, numPartitions=num_partitions)
    if func is not None:
        # Merge each joined value pair with `func`.
        rtn_rdd = _map_value(rtn_rdd, lambda x: func(x[0], x[1]))
    return rtn_rdd
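# Usage sketch for _join (illustrative): without `func` it is a plain inner
# join; with `func` the joined value pair is merged into a single value.
def _join_example(sc):
    a = sc.parallelize([("k", 1), ("m", 2)])
    b = sc.parallelize([("k", 10)])
    print(_join(a, b).collect())                           # [('k', (1, 10))]
    print(_join(a, b, func=lambda x, y: x + y).collect())  # [('k', 11)]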
def _subtract_by_key(rdd: RDD, other: RDD):
    # Keep only the pairs of `rdd` whose key does not appear in `other`.
    return rdd.subtractByKey(other, rdd.getNumPartitions())