Example No. 1
def process_spark_xshards(spark_xshards, num_workers):
    from zoo.orca.data.ray_xshards import RayXShards
    data = spark_xshards
    # Align the partition count with the number of Ray workers.
    if data.num_partitions() != num_workers:
        data = data.repartition(num_workers)
    # Move the Spark partitions into Ray's object store.
    ray_xshards = RayXShards.from_spark_xshards(data)
    return ray_xshards
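A usage sketch for context (not part of the original example): assuming an initialized Orca context with Ray, the input SparkXShards can be built from an in-memory dictionary via XShards.partition, as shown in Example No. 4 below.

import numpy as np
from zoo.orca.data import XShards

# Build a small in-memory SparkXShards and move it to Ray across 2 workers.
spark_xshards = XShards.partition({"x": np.random.randn(8, 4),
                                   "y": np.random.randn(8, 1)})
ray_xshards = process_spark_xshards(spark_xshards, num_workers=2)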
Example No. 2
    def _predict_spark_xshards(self, xshards, params):
        ray_xshards = RayXShards.from_spark_xshards(xshards)

        # Called once per worker/shard pairing; must return a Ray ObjectRef.
        def transform_func(worker, shards_ref):
            params["data_creator"] = make_data_creator(shards_ref)
            return worker.predict.remote(**params)

        pred_shards = ray_xshards.transform_shards_with_actors(
            self.remote_workers, transform_func)
        spark_xshards = pred_shards.to_spark_xshards()
        return spark_xshards
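To make the transform_func contract concrete, here is a minimal, hypothetical Ray actor sketch: transform_shards_with_actors invokes transform_func(worker, shards_ref) for each worker/shard pairing and expects a Ray ObjectRef back. The PredictWorker class and its predict signature are assumptions for illustration, not the Analytics Zoo worker API.

import ray

@ray.remote
class PredictWorker:
    # Hypothetical stand-in for the actors in self.remote_workers.
    def predict(self, data_creator=None, batch_size=None, **other_params):
        # make_data_creator wraps the shard refs into a callable with the
        # (config, batch_size) signature seen in Example No. 5.
        shards = data_creator(config={}, batch_size=batch_size)
        # ... run the model on `shards` and return its predictions ...
        return shards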
Example No. 3
def process_spark_xshards(spark_xshards, num_workers):
    from zoo.orca.data.ray_xshards import RayXShards
    data = spark_xshards
    if data.num_partitions() != num_workers:
        data = data.repartition(num_workers)

    # TODO: currently we need this information to pad the short partitions
    # so that every model runs exactly the same number of steps in one epoch.
    # `data_length` is a helper defined elsewhere in the module (a sketch
    # follows this example); it returns the sample count of one shard.
    max_length = data.rdd.map(data_length) \
        .mapPartitions(lambda iterator: [sum(iterator)]).max()
    ray_xshards = RayXShards.from_spark_xshards(data)
    return max_length, ray_xshards
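A plausible sketch of data_length, assuming each RDD element is a {'x': feature, 'y': label} shard dictionary as in the other examples (the real implementation may differ):

def data_length(data):
    # Hypothetical reconstruction: return the sample count of one shard.
    # The feature may be a single numpy array or a tuple of numpy arrays.
    x = data["x"]
    if isinstance(x, (list, tuple)):
        return x[0].shape[0]
    return x.shape[0]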
Example No. 4
def get_ray_xshards():
    from zoo.orca.data import XShards
    from zoo.orca.data.ray_xshards import RayXShards
    import numpy as np
    ndarray_dict = {"x": np.random.randn(10, 4), "y": np.random.randn(10, 4)}

    spark_xshards = XShards.partition(ndarray_dict)

    ray_xshards = RayXShards.from_spark_xshards(spark_xshards)

    return ray_xshards, ndarray_dict
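A hedged follow-up sketch: the Ray-side shards can be moved back to Spark with to_spark_xshards() (used in Examples No. 2 and No. 5), and collect() is assumed here to gather the partition dictionaries locally for inspection.

ray_xshards, ndarray_dict = get_ray_xshards()

# Round-trip back to Spark and verify no rows were lost.
spark_xshards = ray_xshards.to_spark_xshards()
shards = spark_xshards.collect()  # a list of {'x': ..., 'y': ...} dicts
assert sum(s["x"].shape[0] for s in shards) == ndarray_dict["x"].shape[0]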
Example No. 5
    def _predict_spark_xshards(self, xshards, param):
        ray_xshards = RayXShards.from_spark_xshards(xshards)

        def transform_func(worker, shards_ref):
            data_creator = lambda config, batch_size: shards_ref
            return worker.predict.remote(data_creator, **param)

        pred_shards = ray_xshards.transform_shards_with_actors(
            self.remote_workers, transform_func)
        spark_xshards = pred_shards.to_spark_xshards()
        return spark_xshards
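Note that this variant differs from Example No. 2 only in how the data creator is built: a plain lambda closing over shards_ref replaces the make_data_creator helper, and the remaining param entries are passed to worker.predict.remote as keyword arguments.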
Example No. 6
    def evaluate(self,
                 data,
                 batch_size=32,
                 num_steps=None,
                 verbose=1,
                 sample_weight=None,
                 callbacks=None,
                 data_config=None,
                 feature_cols=None,
                 label_cols=None):
        """
        Evaluates the model on the validation data set.

        :param data: evaluate data. It can be an XShards, a Spark DataFrame, or a creator
               function which returns an Iterator or DataLoader.
               If data is an XShards, each partition can be a Pandas DataFrame or a dictionary
               of {'x': feature, 'y': label}, where feature (label) is a numpy array or a tuple
               of numpy arrays.
        :param batch_size: Batch size used for evaluation. Default: 32.
        :param num_steps: Total number of steps (batches of samples) before declaring the evaluation
               round finished. Ignored with the default value of `None`.
        :param verbose: Prints the output of one model if true.
        :param sample_weight: Optional Numpy array of weights for the training samples, used for
               weighting the loss function. You can either pass a flat (1D) Numpy array with the
               same length as the input samples (1:1 mapping between weights and samples), or in
               the case of temporal data, you can pass a 2D array with shape (samples,
               sequence_length), to apply a different weight to every timestep of every sample.
        :param callbacks: List of Keras compatible callbacks to apply during evaluation.
        :param data_config: An optional dictionary that can be passed to data creator function.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame or an XShards of Pandas DataFrame. Default: None.
        :param label_cols: Label column name(s) of data. Only used when data is a Spark DataFrame or
               an XShards of Pandas DataFrame.
               Default: None.
        :return: validation result
        """
        logger.info("Starting validation step.")
        params = dict(
            batch_size=batch_size,
            verbose=verbose,
            sample_weight=sample_weight,
            steps=num_steps,
            callbacks=callbacks,
            data_config=data_config,
        )
        from zoo.orca.data import SparkXShards

        data, _ = maybe_dataframe_to_xshards(data,
                                             validation_data=None,
                                             feature_cols=feature_cols,
                                             label_cols=label_cols,
                                             mode="evaluate")

        if isinstance(data, SparkXShards):
            if data._get_class_name() == 'pandas.core.frame.DataFrame':
                data = process_xshards_of_pandas_dataframe(
                    data, feature_cols, label_cols)

            if data.num_partitions() != self.num_workers:
                data = data.repartition(self.num_workers)

            ray_xshards = RayXShards.from_spark_xshards(data)

            def transform_func(worker, partition_refs):
                params["data_creator"] = make_data_creator(partition_refs)
                return worker.validate.remote(**params)

            worker_stats = ray_xshards.reduce_partitions_for_actors(
                self.remote_workers, transform_func)
        else:  # data is a creator function; it should return an Iterator or DataLoader
            params["data_creator"] = data
            params_list = [params] * self.num_workers

            worker_stats = ray.get([
                w.validate.remote(**params_list[i])
                for i, w in enumerate(self.remote_workers)
            ])
            worker_stats = list(itertools.chain.from_iterable(worker_stats))
        # Return a copy of the first worker's stats.
        stats = worker_stats[0].copy()
        return stats
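A minimal usage sketch for evaluate, assuming est is an already-fitted Orca estimator exposing this method (the estimator construction itself is not shown in this example):

import numpy as np
from zoo.orca.data import XShards

# In-memory evaluation data as an XShards of {'x': ..., 'y': ...}.
eval_shards = XShards.partition({"x": np.random.randn(100, 4),
                                 "y": np.random.randn(100, 1)})

stats = est.evaluate(eval_shards, batch_size=32)
print(stats)  # the validation result taken from the first worker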
Example No. 7
def process_spark_xshards(spark_xshards, num_workers):
    from zoo.orca.data.ray_xshards import RayXShards
    data = spark_xshards
    ray_xshards = RayXShards.from_spark_xshards(data)
    return ray_xshards
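Unlike Example No. 1, this variant skips the repartition step, so the resulting RayXShards keeps whatever partitioning the input SparkXShards already has; it suits cases where the partition count is already aligned with the number of workers.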