def _union(rdd: RDD, other: RDD, func):
    # Use the larger partition count so neither input loses parallelism.
    num_partition = max(rdd.getNumPartitions(), other.getNumPartitions())

    def _func(pair):
        # `pair` holds the two cogrouped value iterables for one key.
        iter1, iter2 = pair
        val1 = list(iter1)
        val2 = list(iter2)
        if not val1:
            return val2[0]
        if not val2:
            return val1[0]
        # Key present on both sides: merge the two values with `func`.
        return func(val1[0], val2[0])

    return _map_value(rdd.cogroup(other, num_partition), _func)
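# Usage sketch for _union (illustrative, not from the original source); it
# assumes a live SparkContext `sc` and that `_map_value` above is in scope.
from operator import add

def _union_example(sc):
    left = sc.parallelize([("a", 1), ("b", 2)])
    right = sc.parallelize([("b", 10), ("c", 3)])
    # Keys on one side pass through; keys on both sides are merged with `add`.
    merged = _union(left, right, add)
    assert sorted(merged.collect()) == [("a", 1), ("b", 12), ("c", 3)]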
@classmethod
def from_rdd(cls, rdd: RDD, job_id: str, namespace: str, name: str):
    partitions = rdd.getNumPartitions()
    return RDDTable(session_id=job_id,
                    namespace=namespace,
                    name=name,
                    partitions=partitions,
                    rdd=rdd)
def _tmp_table_from_rdd(self, rdd: RDD, name=None):
    """
    Create a temporary table backed by `rdd`, with namespace == job_id.
    """
    rdd = materialize(rdd)
    name = name or str(uuid.uuid1())
    return RDDTable(session_id=self._session_id,
                    namespace=self._namespace,
                    name=name,
                    partitions=rdd.getNumPartitions(),
                    rdd=rdd,
                    dtable=None)
def _check_data(train: RDD = None, test: RDD = None) -> RDD:
    # Data-type check: validate whichever RDD is given and return it.
    if isinstance(train, RDD):
        is_legal_train = train.map(
            lambda u: len(u) >= 3 and u[0] is not None and u[1] is not None
            and isinstance(u[2], Number)).reduce(lambda u1, u2: u1 and u2)
        if not is_legal_train:
            raise ValueError(
                "Parameter train should be an RDD<(user, item, rating)>")
        return train
    if isinstance(test, RDD):
        is_legal_test = test.map(
            lambda u: len(u) >= 2 and u[0] is not None and u[1] is not None
        ).reduce(lambda u1, u2: u1 and u2)
        if not is_legal_test:
            raise ValueError(
                "Parameter test should be an RDD<(user, item)>")
        return test
    raise ValueError("Either train or test must be provided as an RDD.")
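# Hypothetical invocation of _check_data (example data only); assumes a live
# SparkContext `sc` and `from numbers import Number` as used above.
def _check_data_example(sc):
    train = sc.parallelize([("u1", "i1", 4.0), ("u2", "i2", 3.5)])
    checked = _check_data(train=train)    # valid: returned unchanged
    bad = sc.parallelize([("u1", "i1")])  # missing the numeric rating
    try:
        _check_data(train=bad)
    except ValueError as err:
        print(err)  # Parameter train should be an RDD<(user, item, rating)>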
def repar_rdd(rdd: RDD, rdd_count: int, example_per_par=100000, coalesce_only=True):
    """
    Repartition `rdd` based on the number of examples.

    If `coalesce_only` is True and the expected partition count is greater
    than the current one, the RDD is returned unchanged (coalesce can only
    reduce the number of partitions).
    """
    num_partition = rdd.getNumPartitions()
    expect_partition = max(1, int(rdd_count / example_per_par))
    if expect_partition < num_partition:
        rdd = rdd.coalesce(expect_partition)
    elif expect_partition > num_partition and coalesce_only is False:
        rdd = rdd.repartition(expect_partition)
    return rdd
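# Sketch of the partition arithmetic (sizes are illustrative): 1,000,000 rows
# at the default 100,000 examples per partition targets max(1, 10) = 10.
def repar_rdd_example(sc):
    rdd = sc.parallelize(range(1_000_000), numSlices=50)
    shrunk = repar_rdd(rdd, 1_000_000)     # 50 -> 10 partitions via coalesce
    same = repar_rdd(shrunk, 100_000_000)  # unchanged: growing needs coalesce_only=False
    grown = repar_rdd(shrunk, 100_000_000, coalesce_only=False)  # 10 -> 1000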
def __call__(self, rdd: RDD, **kwargs: Any) -> RDD:
    """
    Performs a single step of an algorithm, running all operations in
    sequence and ensuring data is partitioned correctly.

    Any additional keyword arguments passed to this function will be
    available in all life-cycle functions of the step:

    - `group`
    - `emit_by_group`
    - `broadcast`
    - `step`

    **DO NOT OVERRIDE WHEN DEFINING CUSTOM STEPS.**
    """
    if rdd.getNumPartitions() != self._n_partitions:
        rdd = rdd.repartition(self._n_partitions)
    step_cls: Type[Step] = self.__class__
    # Cache because the grouped RDD is used twice (emit and step).
    rdd = step_cls.group(rdd, **kwargs).cache()

    def unwrap_emit(kv: Tuple[Any, Iterable[Any]]) -> Optional[Tuple[Any, Any]]:
        k, v = kv
        new_v = step_cls.emit_by_group(k, v, **kwargs)
        return new_v

    emitted = list(rdd.map(unwrap_emit, preservesPartitioning=True).collect())
    to_broadcast = step_cls.broadcast(emitted, **kwargs)
    broadcast: Broadcast = self._sc.broadcast(to_broadcast)

    def unwrap_step(kv: Tuple[Any, Iterable[Any]]) -> Iterable[Any]:
        k, v = kv
        for new_v in step_cls.step(k, v, broadcast, **kwargs):
            yield new_v

    rdd = rdd.flatMap(unwrap_step, preservesPartitioning=True)
    return rdd
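# Hypothetical Step subclass; the hook signatures are inferred from __call__
# above (group / emit_by_group / broadcast / step), not taken from the
# original source. It centers every value around the global mean.
class MeanCenterStep(Step):
    @staticmethod
    def group(rdd, **kwargs):
        # (key, value) pairs -> (key, iterable of values)
        return rdd.groupByKey()

    @staticmethod
    def emit_by_group(k, v, **kwargs):
        vals = list(v)
        return sum(vals), len(vals)  # each group's local (sum, count)

    @staticmethod
    def broadcast(emitted, **kwargs):
        total = sum(s for s, _ in emitted)
        count = sum(c for _, c in emitted)
        return total / count if count else 0.0  # global mean

    @staticmethod
    def step(k, v, broadcast, **kwargs):
        mean = broadcast.value
        for x in v:
            yield k, x - mean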
def _zipWithIndex(rdd: RDD, to_rows: bool = False) -> RDD:
    """
    Modified from
    https://github.com/davies/spark/blob/cebe5bfe263baf3349353f1473f097396821514a/python/pyspark/rdd.py
    """
    # Count the elements in each partition so every partition knows the
    # global index at which it starts.
    starts = [0]
    if rdd.getNumPartitions() > 1:
        nums = rdd.mapPartitions(lambda it: [sum(1 for i in it)]).collect()
        for i in range(len(nums) - 1):
            starts.append(starts[-1] + nums[i])

    def func1(k, it):  # pragma: no cover
        # Emit (index, value) pairs.
        for i, v in enumerate(it, starts[k]):
            yield i, v

    def func2(k, it):  # pragma: no cover
        # Append the index as a trailing column (row form).
        for i, v in enumerate(it, starts[k]):
            yield list(v) + [i]

    if not to_rows:
        return rdd.mapPartitionsWithIndex(func1)
    else:
        return rdd.mapPartitionsWithIndex(func2)
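# Illustrative call (not from the original file): indices are assigned
# partition by partition, so they follow the RDD's iteration order; assumes
# a live SparkContext `sc`.
def _zipWithIndex_example(sc):
    rdd = sc.parallelize(["a", "b", "c"], numSlices=2)
    print(_zipWithIndex(rdd).collect())  # [(0, 'a'), (1, 'b'), (2, 'c')]
    rows = sc.parallelize([["x", 1], ["y", 2]], numSlices=2)
    print(_zipWithIndex(rows, to_rows=True).collect())  # [['x', 1, 0], ['y', 2, 1]]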
def _join(rdd: RDD, other: RDD, func=None):
    num_partitions = max(rdd.getNumPartitions(), other.getNumPartitions())
    rtn_rdd = rdd.join(other, numPartitions=num_partitions)
    if func is not None:
        # Merge each joined value pair with `func`.
        rtn_rdd = _map_value(rtn_rdd, lambda x: func(x[0], x[1]))
    return rtn_rdd
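# Usage sketch for _join (illustrative): without `func` it is a plain inner
# join; with `func` the joined value pair is merged into a single value.
def _join_example(sc):
    a = sc.parallelize([("k", 1), ("m", 2)])
    b = sc.parallelize([("k", 10)])
    print(_join(a, b).collect())                           # [('k', (1, 10))]
    print(_join(a, b, func=lambda x, y: x + y).collect())  # [('k', 11)]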
def _subtract_by_key(rdd: RDD, other: RDD):
    # Keep only the pairs of `rdd` whose key does not appear in `other`.
    return rdd.subtractByKey(other, rdd.getNumPartitions())