    def run(self, rdd: RDD) -> RDD:  # type: ignore
        # Cache the input: it is traversed once to count and again to bucket.
        rdd = rdd.cache()

        # TeraSort-style sampling rate: with m = n / t points per partition
        # on average, sampling each record with p = ln(n * t) / m yields
        # well-balanced range buckets with high probability.
        n_points = rdd.count()
        m = n_points / self.n_partitions
        optimal_p = math.log(n_points * self.n_partitions) / m

        rdd = self.assign_buckets(  # type: ignore
            rdd, p=optimal_p, key_func=_label_first_coord_and_type
        )
        rdd = self.sort_and_assign_labels(rdd)  # type: ignore

        return rdd
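
The sampling probability computed above is the TeraSort-style rate
p = ln(n * t) / (n / t).  A standalone sketch of that arithmetic (the helper
name terasort_sampling_rate and the figures are ours, purely illustrative):

import math

def terasort_sampling_rate(n_points: int, n_partitions: int) -> float:
    """Per-record sampling probability p = ln(n * t) / (n / t)."""
    m = n_points / n_partitions  # average number of points per partition
    return math.log(n_points * n_partitions) / m

# 1,000,000 points over 8 partitions: sample roughly 0.013% of the records
# to estimate the bucket boundaries.
print(terasort_sampling_rate(1_000_000, 8))  # ~0.000127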

    def run(
        self,
        rdd: RDD,
        key_func: Callable[[Tuple[Any, ...]], Tuple[Any, ...]] = lambda x: x,
    ) -> RDD:  # type: ignore
        # Cache the input: it is traversed once to count and again to bucket.
        rdd = rdd.cache()

        # Same TeraSort-style sampling rate as in the variant above.
        n_points = rdd.count()
        m = n_points / self.n_partitions
        optimal_p = math.log(n_points * self.n_partitions) / m

        rdd = self.assign_buckets(rdd, p=optimal_p,
                                  key_func=key_func)  # type: ignore
        rdd = self.sort(rdd, key_func=key_func)  # type: ignore

        return rdd
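
Both variants follow the classic sample, range-partition, then
sort-within-partitions pipeline.  Since assign_buckets and sort are not shown
here, the following is an illustrative PySpark reconstruction of that pattern,
not the project's code:

from bisect import bisect_left

from pyspark import SparkContext

sc = SparkContext(appName="range-partition-sketch")
data = sc.parallelize([(x % 97, x) for x in range(10_000)], 8)

p = 0.01  # stands in for the optimal_p computed above
sampled = sorted(k for k, _ in data.sample(False, p).collect())
# Keep seven evenly spaced split points for eight target partitions.
step = max(1, len(sampled) // 8)
splits = sampled[step::step][:7]

ranged = data.repartitionAndSortWithinPartitions(
    numPartitions=8,
    partitionFunc=lambda k: bisect_left(splits, k),
)
print(ranged.glom().map(len).collect())  # points per bucket
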
Example #3
File: wick.py  Project: rdguerrerom/drudge
    def normal_order(self, terms: RDD, **kwargs):
        """Normal order the terms according to generalized Wick theorem.

        The actual expansion is driven by the information that the
        subclasses supply through the abstract properties.

        """
        comparator = kwargs.pop('comparator', self.comparator)
        contractor = kwargs.pop('contractor', self.contractor)
        if len(kwargs) != 0:
            raise ValueError('Invalid arguments to Wick normal order', kwargs)

        phase = self.phase
        symms = self.symms
        resolvers = self.resolvers

        terms.cache()
        terms_to_proc = terms.filter(lambda x: len(x.vecs) > 1)
        # Terms already short enough to be in normal order are kept as-is.
        # Note that with no comparator, single-vector terms fall in neither
        # RDD and are dropped, presumably because only fully contracted
        # terms survive in that mode.
        keep_top = 0 if comparator is None else 1
        terms_to_keep = terms.filter(lambda x: len(x.vecs) <= keep_top)
        terms_to_proc.cache()
        if terms_to_proc.count() == 0:
            return terms_to_keep

        # Triples: term, contractions, schemes.
        wick_terms = terms_to_proc.map(lambda x: _prepare_wick(
            x, comparator, contractor, symms.value, resolvers.value))

        if self._wick_parallel == 0:

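            # Level 0: expand every scheme of a term inside the task that
            # holds it; no extra data movement.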
            normal_ordered = wick_terms.flatMap(lambda x: [
                _form_term_from_wick(x[0], x[1], phase, resolvers.value, i)
                for i in x[2]
            ])

        elif self._wick_parallel == 1:

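            # Level 1: one record per (term, contractions, scheme) triple,
            # optionally repartitioned to spread the work evenly.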
            flattened = wick_terms.flatMap(
                lambda x: [(x[0], x[1], i) for i in x[2]])
            if self._num_partitions is not None:
                flattened = flattened.repartition(self._num_partitions)

            normal_ordered = flattened.map(lambda x: _form_term_from_wick(
                x[0], x[1], phase, resolvers.value, x[2]))

        elif self._wick_parallel == 2:

            # Level 2: reserved for really hard problems.  Collect the Wick
            # data to the driver and launch a separate small job per term.
            expanded = []
            for term, contrs, schemes in wick_terms.collect():
                # Bind the loop variables with functools.partial rather than
                # a closure, to work around a probable Spark bug triggered by
                # closures that are created inside a loop and distributed out.
                form_term = functools.partial(_form_term_from_wick_bcast, term,
                                              contrs, phase, resolvers)

                curr = self._ctx.parallelize(schemes).map(form_term)
                expanded.append(curr)

            normal_ordered = self._ctx.union(expanded)

        else:
            raise ValueError('Invalid Wick expansion parallel level',
                             self._wick_parallel)

        return terms_to_keep.union(normal_ordered)
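
The three _wick_parallel levels trade scheduler overhead against how finely
the scheme expansion is spread over the cluster.  A toy PySpark sketch of the
same dispatch pattern, with a made-up (term, schemes) RDD standing in for the
Wick data:

from pyspark import SparkContext

sc = SparkContext(appName="wick-parallel-sketch")

# Each toy "term" carries a handful of "schemes" to expand.
prepared = sc.parallelize(range(100)).map(
    lambda t: (t, list(range(1 + t % 5))))

LEVEL = 1
if LEVEL == 0:
    # Expand all schemes of a term inside the task that holds it.
    results = prepared.flatMap(lambda x: [(x[0], s) for s in x[1]])
elif LEVEL == 1:
    # One record per scheme, repartitioned for load balance.
    flattened = prepared.flatMap(lambda x: [(x[0], s) for s in x[1]])
    results = flattened.repartition(16)
else:
    # One small job per term, unioned at the end.  The t=t default pins the
    # loop variable, playing the same role as functools.partial above.
    jobs = [sc.parallelize(schemes).map(lambda s, t=t: (t, s))
            for t, schemes in prepared.collect()]
    results = sc.union(jobs)

print(results.count())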