Python RDD.repartition示例

    def _evaluate(self, rdd: RDD, **kwargs):
        """
        Evaluate the master network on an RDD of ``(features, label)`` pairs.

        The serialized model, its compile arguments and the broadcast weights
        are shipped to the workers. Every non-empty partition rebuilds the
        model, calls ``model.evaluate`` on its rows, and the per-partition
        results are averaged on the driver (plain, unweighted mean across
        partitions).

        :param rdd: RDD whose elements unpack as ``(x, y)``
        :param kwargs: forwarded unchanged to ``model.evaluate`` on each worker
        :return: the mean loss as a scalar when no metrics are configured,
            otherwise ``[loss, metric_1, ..., metric_n]`` to match the Keras API
        :raises ValueError: if no partition contained any rows
        """
        yaml_model = self.master_network.to_yaml()
        optimizer = deserialize_optimizer(self.master_optimizer)
        loss = self.master_loss
        weights = rdd.context.broadcast(self.master_network.get_weights())
        custom_objects = self.custom_objects
        metrics = self.master_metrics

        def _evaluate_partition(model, optimizer, loss, custom_objects, metrics,
                                kwargs, data_iterator):
            rows = list(data_iterator)
            if not rows:
                # Repartitioning can leave a partition empty; contribute nothing
                # instead of calling evaluate() on zero-length arrays.
                return []
            model = model_from_yaml(model, custom_objects)
            model.compile(optimizer, loss, metrics)
            model.set_weights(weights.value)
            x_test = np.asarray([x for x, _ in rows])
            y_test = np.asarray([y for _, y in rows])
            return [model.evaluate(x_test, y_test, **kwargs)]

        if self.num_workers:
            rdd = rdd.repartition(self.num_workers)
        # Collect exactly once: the RDD is lazy and un-persisted, so running a
        # separate action per value would re-build and re-evaluate the model on
        # every partition each time. There is at most one small result per
        # partition, so bringing them to the driver is cheap.
        partition_results = rdd.mapPartitions(
            partial(_evaluate_partition, yaml_model, optimizer, loss,
                    custom_objects, metrics, kwargs)).collect()
        if not partition_results:
            raise ValueError("Cannot evaluate the model on an empty RDD")

        if not metrics:
            # if no metrics, we can just return the scalar corresponding to the loss value
            return float(np.mean(partition_results))
        # With metrics each partition yields [loss, metric_1, ...]; average
        # column-wise so that every configured metric is reported.
        return np.mean(np.asarray(partition_results), axis=0).tolist()

示例#2

显示文件

文件： spark_model.py 项目： rishabh706/elephas

    def _predict(self, rdd: RDD):
        """
        Run the master network over every row of ``rdd`` and return predictions.

        Rows are tagged with their original position before the optional
        repartition, because ``repartition`` shuffles data and would otherwise
        return predictions in an order unrelated to the input. Each non-empty
        partition rebuilds the model from YAML, loads the broadcast weights and
        predicts on its rows; the driver then restores the input order.

        :param rdd: RDD of feature rows
        :return: list with one prediction per input row, in input order
        """
        yaml_model = self.master_network.to_yaml()
        weights = rdd.context.broadcast(self.master_network.get_weights())
        custom_objects = self.custom_objects

        def _predict_partition(model, custom_objects, indexed_rows):
            indexed_rows = list(indexed_rows)
            if not indexed_rows:
                # Nothing to predict; avoid calling predict() on an empty array.
                return []
            features, indices = zip(*indexed_rows)
            model = model_from_yaml(model, custom_objects)
            model.set_weights(weights.value)
            # NOTE(review): assumes model.predict returns one row per input row
            # (single-output model) - confirm for multi-output networks.
            return zip(indices, model.predict(np.array(list(features))))

        # zipWithIndex may trigger an extra Spark job to size the partitions,
        # but it is what lets us undo the shuffle below.
        indexed_rdd = rdd.zipWithIndex()
        if self.num_workers:
            indexed_rdd = indexed_rdd.repartition(self.num_workers)

        indexed_predictions = indexed_rdd.mapPartitions(
            partial(_predict_partition, yaml_model, custom_objects)).collect()
        indexed_predictions.sort(key=lambda pair: pair[0])
        return [prediction for _, prediction in indexed_predictions]

示例#3

显示文件

文件： pyspark_utils.py 项目： JayYip/bert-multitask-learning

def repar_rdd(rdd: RDD,
              rdd_count: int,
              example_per_par=100000,
              coalesce_only=True):
    """
    Resize the partitioning of ``rdd`` based on its number of examples.

    The target partition count is ``rdd_count // example_per_par``, with a
    minimum of 1.

    - Fewer partitions needed than present: the RDD is coalesced down.
    - More partitions needed than present: the RDD is repartitioned only when
      ``coalesce_only`` is falsy. With the default ``coalesce_only=True``
      nothing happens.
    - Otherwise the RDD is returned unchanged.

    :param rdd: RDD to resize
    :param rdd_count: number of examples in ``rdd`` (supplied by the caller,
        not recomputed here)
    :param example_per_par: desired number of examples per partition
    :param coalesce_only: when truthy, never increase the partition count
    :return: the coalesced, repartitioned, or original RDD
    """
    current_partitions = rdd.getNumPartitions()
    # Floor division keeps the arithmetic in integers instead of going
    # through a float and truncating.
    target_partitions = max(1, int(rdd_count // example_per_par))

    if target_partitions < current_partitions:
        return rdd.coalesce(target_partitions)
    if target_partitions > current_partitions and not coalesce_only:
        return rdd.repartition(target_partitions)
    return rdd

示例#4

显示文件

文件： transform.py 项目： xiashuijun/search-MjoLniR

def partition_per_row(rdd: RDD) -> RDD:
    """Spread an RDD out so that every row lives in its own partition.

    This only pays off when a row stands for something heavy to process,
    e.g. a pointer to an external multi-gb training dataset. The data held
    by spark itself is expected to be tiny and to fit comfortably in a
    single partition.
    """
    row_count = rdd.count()
    # The cast is only there for mypy. A plain lambda is used on purpose:
    # `identity` somehow fails serialization.
    by_row_index = cast(Callable[[int], int], lambda x: x)

    # Gather all rows into one partition and number them 0..row_count-1.
    numbered = rdd.repartition(1).mapPartitions(lambda rows: enumerate(rows))
    # Route each (index, row) pair to the partition matching its index.
    one_row_each = numbered.partitionBy(row_count, by_row_index)
    # Strip the index again so the rows have their original shape.
    return one_row_each.map(lambda pair: pair[1])

示例#5

显示文件

文件： spark_model.py 项目： rishabh706/elephas

    def fit(self, rdd: RDD, **kwargs):
        """
        Train this elephas model on the given RDD.

        The RDD is repartitioned to ``self.num_workers`` partitions when that
        attribute is set, and training is then delegated to ``self._fit`` with
        all keyword arguments passed through.

        :param rdd: RDD with features and labels
        :param kwargs: training options handed to ``self._fit``, e.g.
            ``epochs`` (number of epochs), ``batch_size`` (batch size),
            ``verbose`` (logging verbosity level 0, 1 or 2) and
            ``validation_split`` (share of data set aside for validation)
        :raises ValueError: if ``self.mode`` is not one of 'asynchronous',
            'synchronous' or 'hogwild'
        """
        print('>>> Fit model')
        worker_count = self.num_workers
        if worker_count:
            rdd = rdd.repartition(worker_count)

        supported_modes = ('asynchronous', 'synchronous', 'hogwild')
        if self.mode not in supported_modes:
            raise ValueError(
                "Choose from one of the modes: asynchronous, synchronous or hogwild")
        self._fit(rdd, **kwargs)

示例#6

显示文件

文件： algorithm.py 项目： kowaalczyk/spark-minimal-algorithms

    def __call__(self, rdd: RDD, **kwargs: Any) -> RDD:
        """
        Run one complete step of the algorithm on ``rdd``.

        The input is first repartitioned to ``self._n_partitions`` if its
        partition count differs. It then goes through the step's life-cycle
        functions in order, each of which also receives any extra keyword
        arguments given here:
        - `group`
        - `emit_by_group`
        - `broadcast`
        - `step`

        **DO NOT OVERRIDE WHEN DEFINING CUSTOM STEPS.**
        """
        step_cls: Type[Step] = self.__class__

        def unwrap_emit(kv: Tuple[Any, Iterable[Any]]) -> Optional[Tuple[Any, Any]]:
            key, values = kv
            return step_cls.emit_by_group(key, values, **kwargs)

        if rdd.getNumPartitions() != self._n_partitions:
            rdd = rdd.repartition(self._n_partitions)

        # The grouped RDD feeds both the emit pass and the step pass below,
        # hence the cache.
        grouped = step_cls.group(rdd, **kwargs).cache()

        # Emit pass: one value per group, gathered on the driver.
        emitted = list(
            grouped.map(unwrap_emit, preservesPartitioning=True).collect()
        )
        # Combine the emitted values and make the result available to the
        # step pass through a broadcast variable.
        shared: Broadcast = self._sc.broadcast(
            step_cls.broadcast(emitted, **kwargs)
        )

        def unwrap_step(kv: Tuple[Any, Iterable[Any]]) -> Iterable[Any]:
            key, values = kv
            yield from step_cls.step(key, values, shared, **kwargs)

        return grouped.flatMap(unwrap_step, preservesPartitioning=True)