예제 #1
0
    def transform(self, fn, dtype, skip_undefined, seed):
        """
        Implementation of apply(fn, dtype, skip_undefined, seed).

        Transform each element of the RDD by a given function. The result
        RDD is of type ``dtype``. ``fn`` should be a function that returns
        exactly one value which can be cast into the type specified by
        ``dtype``.

        Parameters
        ----------
        fn : callable
            Called with one element; its result is passed to ``dtype``.

        dtype : callable
            Applied to the value returned by ``fn`` to produce the
            output element.

        skip_undefined : bool
            If true, elements for which ``is_missing`` is true are mapped
            to None and ``fn`` is not called on them.

        seed : object
            If truthy, it is passed to ``distribute_seed`` and to
            ``random.seed`` before the mapping is set up.

        Returns
        -------
        The value of ``self._rv`` for the mapped RDD and ``dtype``.

        Raises
        ------
        ValueError
            If applying ``fn`` or casting with ``dtype`` raised TypeError
            or ValueError for at least one element.
        """
        self._entry(fn, dtype, skip_undefined, seed)
        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        def apply_and_cast(x, fn, dtype, skip_undefined):
            if is_missing(x) and skip_undefined:
                return None
            try:
                return dtype(fn(x))
            except (TypeError, ValueError):
                # A failed cast can raise either exception (for example
                # int(None) is a TypeError, int('abc') a ValueError).
                # The TypeError class itself is stored as the marker so
                # the failure can be found by the filter below.
                return TypeError

        res = self._rdd.map(lambda x: apply_and_cast(x, fn, dtype, skip_undefined))
        # Look for the failure marker and report it as a single error.
        errs = res.filter(lambda x: x is TypeError).take(1)
        if errs:
            raise ValueError('type conversion failure')
        self._exit()
        return self._rv(res, dtype)
예제 #2
0
    def flat_map(self, fn, dtype, skip_undefined, seed):
        """
        Implementation of flat_map(fn, dtype, skip_undefined, seed).

        Transform each element of the RDD by a given function, then flatten. The result
        RDD is of type ``dtype``. ``fn`` should be a function that returns
        a list of values which can be cast into the type specified by
        ``dtype``.

        Parameters
        ----------
        fn : callable
            Called with one element; must return an iterable of values.

        dtype : callable
            Applied to every value produced by ``fn``.

        skip_undefined : bool
            If true, elements for which ``is_missing`` is true produce no
            output, and values returned by ``fn`` for which ``is_missing``
            is true are dropped before casting.

        seed : object
            If truthy, it is passed to ``distribute_seed`` and to
            ``random.seed`` before the mapping is set up.

        Returns
        -------
        The value of ``self._rv`` for the flattened RDD and ``dtype``.

        Raises
        ------
        ValueError
            If applying ``fn`` or casting with ``dtype`` failed for at
            least one element, or if evaluating the check itself raised
            any exception.
        """
        self._entry(fn, dtype, skip_undefined, seed)
        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        def apply_and_cast(x, fn, dtype, skip_undefined):
            if is_missing(x) and skip_undefined:
                return []
            try:
                if skip_undefined:
                    return [dtype(item) for item in fn(x) if not is_missing(item)]
                return [dtype(item) for item in fn(x)]
            except (TypeError, ValueError):
                # The result of this function is flattened, so the marker
                # has to be wrapped in a list to end up as an element that
                # the filter below can find. Returning the bare class
                # would not be iterable.
                return [TypeError]

        res = self._rdd.flatMap(lambda x: apply_and_cast(x, fn, dtype, skip_undefined))

        # Look for the failure marker and report it as a single error.
        try:
            errs = res.filter(lambda x: x is TypeError).take(1)
        except Exception:
            # Kept for backward compatibility: any other failure while
            # evaluating the RDD is also reported as a conversion failure.
            raise ValueError('type conversion failure')
        if errs:
            raise ValueError('type conversion failure')
        self._exit()
        return self._rv(res, dtype)
예제 #3
0
    def filter(self, fn, skip_undefined, seed):
        """
        Keep only the elements of this RDD that ``fn`` accepts.

        ``fn`` is called with one element and its result is used as the
        predicate value handed to the RDD's ``filter``: elements for which
        the result is true are kept, all others are left out.

        Parameters
        ----------
        fn : callable
            Predicate called with one element.

        skip_undefined : bool
            If true, elements that are None are left out and ``fn`` is
            not called on them.

        seed : object
            If truthy, it is passed to ``distribute_seed`` and to
            ``random.seed`` before the filter is set up.

        Returns
        -------
        The value of ``self._rv`` for the filtered RDD.
        """
        self._entry(fn, skip_undefined, seed)

        if seed:
            distribute_seed(self._rdd, seed)
            random.seed(seed)

        def keep(element):
            # None elements are rejected up front when skipping is on;
            # the None result is falsy, so the element is dropped.
            if skip_undefined and element is None:
                return None
            return fn(element)

        filtered = self._rdd.filter(keep)
        self._exit()
        return self._rv(filtered)