Example #1
    def evaluate(self,
                 data_creator,
                 verbose=1,
                 sample_weight=None,
                 steps=None,
                 callbacks=None,
                 data_config=None,
                 feature_cols=None,
                 label_cols=None):
        """Evaluates the model on the validation data set."""
        logger.info("Starting validation step.")
        params = dict(
            verbose=verbose,
            sample_weight=sample_weight,
            steps=steps,
            callbacks=callbacks,
            data_config=data_config,
        )
        from zoo.orca.data import SparkXShards
        from pyspark.sql import DataFrame

        if isinstance(data_creator, DataFrame):
            assert feature_cols is not None,\
                "feature_cols must be provided if data_creator is a Spark DataFrame"
            assert label_cols is not None,\
                "label_cols must be provided if data_creator is a Spark DataFrame"
            schema = data_creator.schema
            numpy_rdd = data_creator.rdd.map(lambda row: convert_row_to_numpy(
                row, schema, feature_cols, label_cols))
            shard_rdd = numpy_rdd.mapPartitions(
                lambda x: arrays2dict(x, feature_cols, label_cols))
            data_creator = SparkXShards(shard_rdd)

        if isinstance(data_creator, SparkXShards):
            data = data_creator
            if data.num_partitions() != self.num_workers:
                data = data.repartition(self.num_workers)

            ray_xshards = RayXShards.from_spark_xshards(data)

            def transform_func(worker, shards_ref):
                params["data_creator"] = shards_ref_to_creator(shards_ref)
                return worker.validate.remote(**params)

            stats_shards = ray_xshards.transform_shards_with_actors(
                self.remote_workers, transform_func, gang_scheduling=True)
            worker_stats = stats_shards.collect()

        else:  # data_creator functions; should return Iter or DataLoader
            params["data_creator"] = data_creator
            params_list = [params] * self.num_workers

            worker_stats = ray.get([
                w.validate.remote(**params_list[i])
                for i, w in enumerate(self.remote_workers)
            ])
            worker_stats = list(itertools.chain.from_iterable(worker_stats))
        stats = worker_stats[0].copy()
        return stats
Example #2
def update_predict_xshards(xshard, pred_xshards):
    def updates(d1_d2):
        d1, d2 = d1_d2
        d1.update(d2)
        return d1

    result = SparkXShards(xshard.rdd.zip(pred_xshards.rdd).map(updates))
    return result
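The helper above relies on the two SparkXShards being partition-aligned: zip pairs each feature shard with the prediction shard produced from it, and updates merges the prediction dict into the feature dict. A minimal sketch of that merge step on plain dicts, with no Spark involved; the shard contents are made up for illustration:

import numpy as np

# hypothetical shard contents: one feature shard and its matching prediction shard
feature_shard = {"x": np.arange(6).reshape(3, 2)}
pred_shard = {"prediction": np.array([0.1, 0.7, 0.3])}

def updates(d1_d2):
    # same merge logic as in update_predict_xshards above
    d1, d2 = d1_d2
    d1.update(d2)
    return d1

merged = updates((feature_shard, pred_shard))
print(sorted(merged.keys()))  # ['prediction', 'x']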
Example #3
    def load(self, model_path, minPartitions=None):
        """
        Restore the model from the model file.

        :param model_path: the model file path.
        :param minPartitions: the minimum number of partitions to use when loading. Default: None.
        :return: the restored model.
        """
        self.internal = SparkXShards.load_pickle(model_path,
                                                  minPartitions=minPartitions)
Example #4
    def get_pred_xshards(key):
        rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
        shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
            lambda x: {
                key: np.stack(x)
            }).map(lambda x: {key: [x[key][:, :24], x[key][:, 24:]]})
        shards = SparkXShards(shards)
        return shards
Example #5
    def test_openvino_predict_xshards(self):
        input_data_list = [np.array([self.input] * 4), np.array([self.input] * 2)]
        sc = init_nncontext()
        rdd = sc.parallelize(input_data_list, numSlices=2)
        shards = SparkXShards(rdd)

        def pre_processing(images):
            return {"x": images}

        shards = shards.transform_shard(pre_processing)
        result = self.est.predict(shards)
        result_c = result.collect()
        assert isinstance(result, SparkXShards)
        assert result_c[0]["prediction"].shape == (4, 1000)
        assert result_c[1]["prediction"].shape == (2, 1000)
        assert self.check_result(result_c[0]["prediction"], 4)
        assert self.check_result(result_c[1]["prediction"], 2)
Example #6
    def test_transform_with_repartition(self):
        # shards of pandas dataframe
        file_path = os.path.join(self.resource_path, "orca/data/csv")
        data_shard = zoo.orca.data.pandas.read_csv(file_path)
        partitions = data_shard.rdd.glom().collect()
        for par in partitions:
            assert len(par) <= 1

        def negative(df, column_name):
            df[column_name] = df[column_name] * (-1)
            return df
        shard2 = data_shard.transform_shard(negative, "sale_price")

        shard3 = shard2.repartition(4)
        partitions3 = shard3.rdd.glom().collect()
        for par in partitions3:
            assert len(par) <= 1

        shard4 = shard2.repartition(1)
        partitions4 = shard4.rdd.glom().collect()
        for par in partitions4:
            assert len(par) <= 1

        shard5 = shard4.transform_shard(negative, "sale_price")
        partitions5 = shard5.rdd.glom().collect()
        for par in partitions5:
            assert len(par) <= 1
        # shards of list
        data = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]]
        sc = init_nncontext()
        rdd = sc.parallelize(data)
        data_shard = SparkXShards(rdd)
        shard2 = data_shard.repartition(6)
        partitions2 = shard2.rdd.glom().collect()
        for par in partitions2:
            assert len(par) <= 1
        shard3 = data_shard.repartition(1)
        partitions2 = shard3.rdd.glom().collect()
        for par in partitions2:
            assert len(par) <= 1

        # shards of numpy array
        data = [np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
                np.array([9, 10, 11, 12]), np.array([13, 14, 15, 16])]
        sc = init_nncontext()
        rdd = sc.parallelize(data)
        data_shard = SparkXShards(rdd)
        shard2 = data_shard.repartition(6)
        partitions2 = shard2.rdd.glom().collect()
        for par in partitions2:
            assert len(par) <= 1
        shard3 = data_shard.repartition(1)
        partitions2 = shard3.rdd.glom().collect()
        for par in partitions2:
            assert len(par) <= 1
Example #7
    def test_spark_xshards(self):
        from zoo import init_nncontext
        from zoo.orca.data import SparkXShards
        estimator = get_estimator(workers_per_node=1)
        sc = init_nncontext()
        x_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
        # torch 1.7.1+ requires target size same as output size, which is (batch, 1)
        y_rdd = sc.parallelize(
            np.random.randint(0, 2, size=(4000, 1, 1)).astype(np.float32))
        rdd = x_rdd.zip(y_rdd).map(lambda x_y: {'x': x_y[0], 'y': x_y[1]})
        train_rdd, val_rdd = rdd.randomSplit([0.9, 0.1])
        train_xshards = SparkXShards(train_rdd)
        val_xshards = SparkXShards(val_rdd)
        train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
        print(train_stats)
        val_stats = estimator.evaluate(val_xshards, batch_size=128)
        print(val_stats)
        estimator.shutdown()
Example #8
def _dataframe_to_xshards(data, feature_cols, label_cols=None):
    from zoo.orca import OrcaContext
    schema = data.schema
    shard_size = OrcaContext._shard_size
    numpy_rdd = data.rdd.map(lambda row: convert_row_to_numpy(
        row, schema, feature_cols, label_cols))
    shard_rdd = numpy_rdd.mapPartitions(
        lambda x: arrays2dict(x, feature_cols, label_cols, shard_size))
    return SparkXShards(shard_rdd)
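convert_row_to_numpy and arrays2dict are helpers from zoo.orca.learn.utils and are not shown here. The sketch below is a simplified, stand-alone reimplementation of the kind of grouping arrays2dict performs (stacking per-row arrays into one batched dict per partition); it is written only to illustrate the {'x': ..., 'y': ...} shard layout, ignores shard_size, and does not reproduce the library's exact signature:

import numpy as np

def arrays2dict_sketch(row_arrays, feature_cols, label_cols):
    # row_arrays: iterable of (features, labels) tuples, one per row,
    # where each element is a tuple of numpy arrays (one per column)
    rows = list(row_arrays)
    features = [np.stack([r[0][i] for r in rows]) for i in range(len(feature_cols))]
    labels = [np.stack([r[1][i] for r in rows]) for i in range(len(label_cols))]
    # one dict per partition, matching the shard layout used by the estimators
    return [{"x": features if len(features) > 1 else features[0],
             "y": labels if len(labels) > 1 else labels[0]}]

rows = [((np.array([1.0, 2.0]),), (np.array([0.0]),)),
        ((np.array([3.0, 4.0]),), (np.array([1.0]),))]
print(arrays2dict_sketch(rows, ["f"], ["l"])[0]["x"].shape)  # (2, 2)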
Example #9
    def test_openvino_predict_xshards(self):
        input_data_list = [
            np.random.random([1, 4, 3, 224, 224]),
            np.random.random([2, 4, 3, 224, 224])
        ]
        sc = init_nncontext()
        rdd = sc.parallelize(input_data_list, numSlices=2)
        shards = SparkXShards(rdd)

        def pre_processing(images):
            return {"x": images}

        shards = shards.transform_shard(pre_processing)
        result = self.est.predict(shards)
        result_c = result.collect()
        assert isinstance(result, SparkXShards)
        assert result_c[0]["prediction"].shape == (1, 4, 1000)
        assert result_c[1]["prediction"].shape == (2, 4, 1000)
Example #10
    def test_zip(self):
        def negative(df, column_name, minus_val):
            df[column_name] = df[column_name] * (-1)
            df[column_name] = df[column_name] - minus_val
            return df

        file_path = os.path.join(self.resource_path, "orca/data/json")
        data_shard = zoo.orca.data.pandas.read_json(file_path,
                                                    orient='columns',
                                                    lines=True)
        data_shard = data_shard.repartition(2)
        data_shard.cache()
        transformed_shard = data_shard.transform_shard(negative, "value", 2)
        zipped_shard = data_shard.zip(transformed_shard)
        assert not transformed_shard.is_cached(
        ), "transformed_shard should be uncached."
        data = zipped_shard.collect()
        assert data[0][0]["value"].values[0] + data[0][1]["value"].values[0] == -2, \
            "value should be -2"
        list1 = list([1, 2, 3])
        with self.assertRaises(Exception) as context:
            data_shard.zip(list1)
        self.assertTrue(
            'other should be a SparkXShards' in str(context.exception))
        transformed_shard = transformed_shard.repartition(
            data_shard.num_partitions() - 1)
        with self.assertRaises(Exception) as context:
            data_shard.zip(transformed_shard)
        self.assertTrue(
            'The two SparkXShards should have the same number of partitions' in
            str(context.exception))
        dict_data = [{"x": 1, "y": 2}, {"x": 2, "y": 3}]
        sc = init_nncontext()
        rdd = sc.parallelize(dict_data)
        dict_shard = SparkXShards(rdd)
        dict_shard = dict_shard.repartition(1)
        with self.assertRaises(Exception) as context:
            transformed_shard.zip(dict_shard)
        self.assertTrue(
            'The two SparkXShards should have the same number of elements in '
            'each partition' in str(context.exception))
Example #11
    def test_spark_xshards(self):
        from zoo import init_nncontext
        from zoo.orca.data import SparkXShards
        estimator = Estimator.from_torch(model=get_model,
                                         optimizer=get_optimizer,
                                         loss=nn.BCELoss(),
                                         config={"lr": 1e-1},
                                         backend="torch_distributed")
        sc = init_nncontext()
        x_rdd = sc.parallelize(np.random.rand(4000, 1, 50).astype(np.float32))
        y_rdd = sc.parallelize(
            np.random.randint(0, 2, size=(4000, 1)).astype(np.float32))
        rdd = x_rdd.zip(y_rdd).map(lambda x_y: {'x': x_y[0], 'y': x_y[1]})
        train_rdd, val_rdd = rdd.randomSplit([0.9, 0.1])
        train_xshards = SparkXShards(train_rdd)
        val_xshards = SparkXShards(val_rdd)
        train_stats = estimator.fit(train_xshards, batch_size=256, epochs=2)
        print(train_stats)
        val_stats = estimator.evaluate(val_xshards, batch_size=128)
        print(val_stats)
        estimator.shutdown()
Example #12
    def to_spark_xshards(self):
        from zoo.orca.data import SparkXShards
        ray_ctx = RayContext.get()
        sc = ray_ctx.sc
        address = ray_ctx.redis_address
        password = ray_ctx.redis_password
        num_parts = self.num_partitions()
        partition2store = self.partition2store_name
        rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
            .mapPartitionsWithIndex(
            lambda idx, _: get_from_ray(idx, address, password, partition2store))
        spark_xshards = SparkXShards(rdd)
        return spark_xshards
Example #13
def convert_predict_to_xshard(prediction_rdd):
    def transform_predict(iter):
        predictions = list(iter)
        # list of np array
        if isinstance(predictions[0], list):
            predictions = np.array(predictions).T.tolist()
            result = [np.array(predict) for predict in predictions]
            return [{'prediction': result}]
        # np array
        else:
            return [{'prediction': np.array(predictions)}]

    return SparkXShards(prediction_rdd.mapPartitions(transform_predict))
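A standalone check of the packing logic in transform_predict, run on plain Python lists rather than an RDD partition; the prediction values are made up:

import numpy as np

def transform_predict(iter):
    predictions = list(iter)
    # list of np arrays per sample -> one array per model output
    if isinstance(predictions[0], list):
        predictions = np.array(predictions).T.tolist()
        result = [np.array(predict) for predict in predictions]
        return [{'prediction': result}]
    # single np array per sample -> one stacked array
    else:
        return [{'prediction': np.array(predictions)}]

# single-output model: one array per sample
single = transform_predict([np.array([0.1]), np.array([0.9])])
print(single[0]['prediction'].shape)              # (2, 1)

# two-output model: a list of two arrays per sample
multi = transform_predict([[np.array(1.0), np.array(2.0)],
                           [np.array(3.0), np.array(4.0)]])
print([a.shape for a in multi[0]['prediction']])  # [(2,), (2,)]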
Example #14
    def test_convert_predict_rdd_to_xshard(self):
        rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
        shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
            lambda x: {"x": np.stack(x)})
        shards = SparkXShards(shards)
        pred_rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
        result_shards = convert_predict_rdd_to_xshard(shards, pred_rdd)
        result = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
        expected_result = np.concatenate(
            [shard["x"] for shard in result_shards.collect()])

        assert np.array_equal(result, expected_result)
Example #15
    def test_xshards_predict(self):

        sc = init_nncontext()
        rdd = sc.range(0, 110).map(lambda x: np.array([x] * 50))
        shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
            lambda x: {"x": np.stack(x)})
        shards = SparkXShards(shards)

        estimator = get_estimator(workers_per_node=2,
                                  model_fn=lambda config: IdentityNet())
        result_shards = estimator.predict(shards, batch_size=4)
        result = np.concatenate(
            [shard["prediction"] for shard in result_shards.collect()])
        expected_result = np.concatenate(
            [shard["x"] for shard in result_shards.collect()])

        assert np.array_equal(result, expected_result)
Example #16
def convert_predict_rdd_to_xshard(data, prediction_rdd):
    import numpy as np
    from zoo.orca.data import SparkXShards

    def group_index(iter):
        for data in iter:
            size = get_size(data["x"])
            for i in range(size):
                yield size

    def transform_predict(predictions):
        # list of np array
        if isinstance(predictions[0], list):
            predictions = np.array(predictions).T.tolist()
            result = [np.array(predict) for predict in predictions]
            return result
        # np array
        else:
            return np.array(predictions)

    def group(iter):
        this_index = 0
        buffer = []
        this_count = None
        for (count, pred) in iter:
            if this_index == 0:
                this_count = count
            if this_index < this_count:
                buffer.append(pred)
                this_index += 1
            if this_index == this_count:
                yield transform_predict(buffer)
                buffer.clear()
                this_index = 0

    def add_pred(shard_pred):
        shard, pred = shard_pred
        shard["prediction"] = pred
        return shard

    indexed_rdd = data.rdd.mapPartitions(group_index)
    grouped_pred = indexed_rdd.zip(prediction_rdd).mapPartitions(group)
    result_rdd = data.rdd.zip(grouped_pred).map(add_pred)
    return SparkXShards(result_rdd)
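The group_index/group pair above lets per-sample predictions be regrouped into one prediction array per shard, so that shards and predictions can be zipped back together. A sketch of that bookkeeping on plain Python lists, assuming get_size simply returns the number of rows in a shard's 'x' array and simplifying transform_predict to a single np.array call:

import numpy as np

def group_index(iter):
    # emit the shard size once per sample in that shard, mirroring group_index above
    for data in iter:
        size = len(data["x"])
        for _ in range(size):
            yield size

def group(iter):
    # regroup (size, prediction) pairs back into one prediction array per shard
    this_index = 0
    buffer = []
    this_count = None
    for (count, pred) in iter:
        if this_index == 0:
            this_count = count
        if this_index < this_count:
            buffer.append(pred)
            this_index += 1
        if this_index == this_count:
            yield np.array(buffer)  # transform_predict simplified to np.array
            buffer.clear()
            this_index = 0

shards = [{"x": np.zeros((3, 2))}, {"x": np.zeros((2, 2))}]
preds = [np.array([i]) for i in range(5)]      # 3 + 2 per-sample predictions
sizes = list(group_index(shards))              # [3, 3, 3, 2, 2]
grouped = list(group(zip(sizes, preds)))
print([g.shape for g in grouped])              # [(3, 1), (2, 1)]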
Example #17
    def to_spark_xshards(self):
        from zoo.orca.data import SparkXShards
        ray_ctx = RayContext.get()
        sc = ray_ctx.sc
        address = ray_ctx.redis_address
        password = ray_ctx.redis_password
        num_parts = self.num_partitions()
        partition2store = self.partition2store_name
        rdd = sc.parallelize([0] * num_parts * 10, num_parts)\
            .mapPartitionsWithIndex(
            lambda idx, _: get_from_ray(idx, address, password, partition2store))

        # the reason why we trigger computation here is to ensure we get the data
        # from ray before the RayXShards goes out of scope and the data gets garbage collected
        from pyspark.storagelevel import StorageLevel
        rdd = rdd.cache()
        result_rdd = rdd.map(lambda x: x)  # SparkXShards will uncache the rdd when it is gc'ed
        spark_xshards = SparkXShards(result_rdd)
        return spark_xshards
Example #18
    def predict(self,
                data_creator,
                batch_size=None,
                verbose=1,
                steps=None,
                callbacks=None,
                data_config=None,
                feature_cols=None):
        """Evaluates the model on the validation data set."""
        logger.info("Starting predict step.")
        params = dict(
            verbose=verbose,
            batch_size=batch_size,
            steps=steps,
            callbacks=callbacks,
            data_config=data_config,
        )
        from zoo.orca.data import SparkXShards
        from pyspark.sql import DataFrame
        if isinstance(data_creator, DataFrame):
            assert feature_cols is not None,\
                "feature_col must be provided if data_creator is a spark dataframe"
            schema = data_creator.schema
            numpy_rdd = data_creator.rdd.map(lambda row: convert_row_to_numpy(
                row, schema, feature_cols, None))
            shard_rdd = numpy_rdd.mapPartitions(
                lambda x: arrays2dict(x, feature_cols, None))
            data_creator = SparkXShards(shard_rdd)
        if isinstance(data_creator, SparkXShards):
            ray_xshards = RayXShards.from_spark_xshards(data_creator)

            def transform_func(worker, shards_ref):
                params["data_creator"] = shards_ref_to_creator(shards_ref)
                return worker.predict.remote(**params)

            stats_shards = ray_xshards.transform_shards_with_actors(
                self.remote_workers, transform_func, gang_scheduling=False)
            spark_xshards = stats_shards.to_spark_xshards()

        else:
            raise ValueError("Only xshards is supported for predict")

        return spark_xshards
Example #19
    def _read_as_xshards(path):
        rdd, schema = ParquetDataset._read_as_dict_rdd(path)

        def merge_records(schema, iter):
            records = list(iter)
            result = {}
            for k in schema.keys():
                result[k] = []
            for rec in records:
                for k in schema.keys():
                    result[k].append(rec[k])
            for k, v in schema.items():
                if not v.feature_type == FeatureType.IMAGE:
                    result[k] = np.stack(result[k])

            return [result]

        result_rdd = rdd.mapPartitions(lambda iter: merge_records(schema, iter))
        xshards = SparkXShards(result_rdd)
        return xshards
Example #20
def read_parquet(file_path, columns=None, schema=None, **options):
    """
    Read parquet files to SparkXShards of pandas DataFrames.

    :param file_path: Parquet file path, a list of multiple parquet file paths, or a directory
    containing parquet files. Local file system, HDFS, and AWS S3 are supported.
    :param columns: list of column names, default=None.
    If not None, only these columns will be read from the file.
    :param schema: pyspark.sql.types.StructType for the input schema or
    a DDL-formatted string (for example, "col0 INT, col1 DOUBLE").
    :param options: other options for reading parquet.
    :return: An instance of SparkXShards.
    """
    sc = init_nncontext()
    from pyspark.sql import SQLContext
    sqlContext = SQLContext.getOrCreate(sc)
    spark = sqlContext.sparkSession
    # df = spark.read.parquet(file_path)
    df = spark.read.load(file_path, "parquet", schema=schema, **options)

    if columns:
        df = df.select(*columns)

    def to_pandas(columns):
        def f(iter):
            import pandas as pd
            data = list(iter)
            pd_df = pd.DataFrame(data, columns=columns)
            return [pd_df]

        return f

    pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns))
    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as e:
        print("An error occurred when reading parquet files")
        raise e
    return data_shards
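A hypothetical call to the function above; the path and column names are placeholders and a running Spark/Analytics Zoo context is assumed:

# placeholder HDFS path and columns, for illustration only
shards = read_parquet("hdfs://namenode:8020/data/sales.parquet",
                      columns=["id", "sale_price"])
pdf = shards.collect()[0]        # each partition becomes one pandas DataFrame
print(pdf.columns.tolist())      # ['id', 'sale_price']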
Example #21
def read_parquet(file_path, columns=None, **kwargs):
    """
    Read parquet files to SparkXShards of pandas DataFrames.

    :param file_path: Parquet file path, a list of multiple parquet file paths, or a directory
    containing parquet files. Local file system, HDFS, and AWS S3 are supported.
    :param columns: list of column names, default=None.
    If not None, only these columns will be read from the file.
    :param kwargs: Any additional kwargs.
    :return: An instance of SparkXShards.
    """
    sc = init_nncontext()
    from pyspark.sql import SQLContext
    sqlContext = SQLContext.getOrCreate(sc)
    spark = sqlContext.sparkSession
    df = spark.read.parquet(file_path)

    if columns:
        df = df.select(*columns)

    def to_pandas(columns):
        def f(iter):
            import pandas as pd
            data = list(iter)
            pd_df = pd.DataFrame(data, columns=columns)
            return [pd_df]

        return f

    pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns))
    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as e:
        print("An error occurred when reading parquet files")
        raise e
    return data_shards
Example #22
    def predict(self, data, feature_cols=None, batch_size=4):
        """
        Predict input data

        :param batch_size: Int. The batch size, default is 4.
        :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of numpy
               arrays are supported. If data is XShards, each partition is a dictionary of {'x':
               feature}, where feature is a numpy array or a list of numpy arrays.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame. Default: None.
        :return: predicted result.
                 If the input data is XShards, the predict result is a XShards, each partition
                 of the XShards is a dictionary of {'prediction': result}, where the result is a
                 numpy array or a list of numpy arrays.
                 If the input data is numpy arrays or list of numpy arrays, the predict result is
                 a numpy array or a list of numpy arrays.
        """
        sc = init_nncontext()
        model_bytes_broadcast = sc.broadcast(self.model_bytes)
        weight_bytes_broadcast = sc.broadcast(self.weight_bytes)

        def partition_inference(partition):
            model_bytes = model_bytes_broadcast.value
            weight_bytes = weight_bytes_broadcast.value
            partition = list(partition)
            data_num = len(partition)
            ie = IECore()
            config = {'CPU_THREADS_NUM': str(self.core_num)}
            ie.set_config(config, 'CPU')
            net = ie.read_network(model=model_bytes,
                                  weights=weight_bytes,
                                  init_from_buffer=True)
            net.batch_size = batch_size
            local_model = ie.load_network(network=net,
                                          device_name="CPU",
                                          num_requests=data_num)
            inputs = list(iter(local_model.requests[0].input_blobs))
            outputs = list(iter(local_model.requests[0].output_blobs))
            assert len(
                outputs) != 0, "The number of model outputs should not be 0."

            def add_elem(d):
                d_len = len(d)
                if d_len < batch_size:
                    rep_time = [1] * (d_len - 1)
                    rep_time.append(batch_size - d_len + 1)
                    return np.repeat(d, rep_time, axis=0), d_len
                else:
                    return d, d_len

            results = []
            for idx, batch_data in enumerate(partition):
                infer_request = local_model.requests[idx]
                input_dict = dict()
                elem_num = 0
                if isinstance(batch_data, list):
                    for i, input in enumerate(inputs):
                        input_dict[input], elem_num = add_elem(batch_data[i])
                else:
                    input_dict[inputs[0]], elem_num = add_elem(batch_data)
                infer_request.infer(input_dict)
                if len(outputs) == 1:
                    results.append(infer_request.output_blobs[
                        outputs[0]].buffer[:elem_num])
                else:
                    results.append(
                        list(
                            map(
                                lambda output: infer_request.output_blobs[
                                    output].buffer[:elem_num], outputs)))

            return results

        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[0] <= batch_size, \
                    "The batch size of input data (the first dim) should not be larger than the " \
                    "model batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "a ndarray, but get " + \
                                                         elem.__class__.__name__
                    assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                        "first dim) should not be larger than " \
                                                        "the model batch size, otherwise some " \
                                                        "inputs will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be a ndarray or a list of ndarray."
                )
            return feature_data

        if isinstance(data, DataFrame):
            from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
            xshards, _ = dataframe_to_xshards(data,
                                              validation_data=None,
                                              feature_cols=feature_cols,
                                              label_cols=None,
                                              mode="predict")
            transformed_data = xshards.transform_shard(predict_transform,
                                                       batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            return convert_predict_rdd_to_dataframe(
                data, result_rdd.flatMap(lambda data: data))
        elif isinstance(data, SparkXShards):
            transformed_data = data.transform_shard(predict_transform,
                                                    batch_size)
            result_rdd = transformed_data.rdd.mapPartitions(
                lambda iter: partition_inference(iter))

            def update_result_shard(data):
                shard, y = data
                shard["prediction"] = y
                return shard

            return SparkXShards(
                data.rdd.zip(result_rdd).map(update_result_shard))
        elif isinstance(data, (np.ndarray, list)):
            if isinstance(data, np.ndarray):
                split_num = math.ceil(len(data) / batch_size)
                arrays = np.array_split(data, split_num)
                num_slices = min(split_num, self.node_num)
                data_rdd = sc.parallelize(arrays, numSlices=num_slices)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = math.ceil(flattened[0].shape[0] / batch_size)
                num_slices = min(split_num, self.node_num)
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=num_slices)

            print("Partition number: ", data_rdd.getNumPartitions())
            result_rdd = data_rdd.mapPartitions(
                lambda iter: partition_inference(iter))
            result_arr_list = result_rdd.collect()
            result_arr = None
            if isinstance(result_arr_list[0], list):
                result_arr = [
                    np.concatenate([r[i] for r in result_arr_list], axis=0)
                    for i in range(len(result_arr_list[0]))
                ]
            elif isinstance(result_arr_list[0], np.ndarray):
                result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
                "ays are supported as input data, but get " +
                data.__class__.__name__)
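The add_elem helper above pads a short final batch up to the model's fixed batch size by repeating its last row, and the inference result is later truncated back to elem_num. A numpy-only check of that padding behaviour, with batch_size set to an arbitrary illustrative value:

import numpy as np

batch_size = 4  # illustrative value

def add_elem(d):
    # pad a short batch by repeating its final row until it reaches batch_size,
    # and return the original length so the result can be trimmed afterwards
    d_len = len(d)
    if d_len < batch_size:
        rep_time = [1] * (d_len - 1)
        rep_time.append(batch_size - d_len + 1)
        return np.repeat(d, rep_time, axis=0), d_len
    else:
        return d, d_len

short_batch = np.array([[1.0], [2.0], [3.0]])   # only 3 rows
padded, elem_num = add_elem(short_batch)
print(padded.shape, elem_num)                    # (4, 1) 3
print(padded[:elem_num].shape)                   # trimmed back: (3, 1)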
Example #23
def read_file_spark(file_path, file_type, **kwargs):
    sc = init_nncontext()
    file_url_splits = file_path.split("://")
    prefix = file_url_splits[0]
    node_num, core_num = get_node_and_core_number()

    file_paths = []
    if isinstance(file_path, list):
        [
            file_paths.extend(extract_one_path(path, file_type, os.environ))
            for path in file_path
        ]
    else:
        file_paths = extract_one_path(file_path, file_type, os.environ)

    if not file_paths:
        raise Exception(
            "The file path is invalid/empty or does not include csv/json files"
        )

    if ZooContext.orca_pandas_read_backend == "pandas":
        num_files = len(file_paths)
        total_cores = node_num * core_num
        num_partitions = num_files if num_files < total_cores else total_cores
        rdd = sc.parallelize(file_paths, num_partitions)

        if prefix == "hdfs":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_hdfs_file_list(iter, file_type, **kwargs))
        elif prefix == "s3":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_s3_file_list(iter, file_type, **kwargs))
        else:

            def loadFile(iterator):
                for x in iterator:
                    df = read_pd_file(x, file_type, **kwargs)
                    yield df

            pd_rdd = rdd.mapPartitions(loadFile)
    else:
        from pyspark.sql import SQLContext
        sqlContext = SQLContext.getOrCreate(sc)
        spark = sqlContext.sparkSession
        # TODO: add S3 credentials
        if file_type == "json":
            df = spark.read.json(file_paths, **kwargs)
        elif file_type == "csv":
            df = spark.read.csv(file_paths, **kwargs)
        else:
            raise Exception("Unsupported file type")
        if df.rdd.getNumPartitions() < node_num:
            df = df.repartition(node_num)

        def to_pandas(columns):
            def f(iter):
                import pandas as pd
                data = list(iter)
                yield pd.DataFrame(data, columns=columns)

            return f

        pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns))

    data_shards = SparkXShards(pd_rdd)
    return data_shards
Example #24
    def fit(
        self,
        data_creator,
        epochs=1,
        verbose=1,
        callbacks=None,
        validation_data_creator=None,
        class_weight=None,
        steps_per_epoch=None,
        validation_steps=None,
        validation_freq=1,
        data_config=None,
        feature_cols=None,
        label_cols=None,
    ):
        """Runs a training epoch."""
        params = dict(epochs=epochs,
                      verbose=verbose,
                      callbacks=callbacks,
                      class_weight=class_weight,
                      steps_per_epoch=steps_per_epoch,
                      validation_steps=validation_steps,
                      validation_freq=validation_freq,
                      data_config=data_config)

        from zoo.orca.data import SparkXShards
        from pyspark.sql import DataFrame
        if isinstance(data_creator, DataFrame):
            assert feature_cols is not None,\
                "feature_cols must be provided if data_creator is a Spark DataFrame"
            assert label_cols is not None,\
                "label_cols must be provided if data_creator is a Spark DataFrame"
            schema = data_creator.schema
            numpy_rdd = data_creator.rdd.map(lambda row: convert_row_to_numpy(
                row, schema, feature_cols, label_cols))
            shard_rdd = numpy_rdd.mapPartitions(
                lambda x: arrays2dict(x, feature_cols, label_cols))
            data_creator = SparkXShards(shard_rdd)

        if isinstance(data_creator, SparkXShards):
            max_length, ray_xshards = process_spark_xshards(
                data_creator, self.num_workers)

            if validation_data_creator is None:

                def transform_func(worker, shards_ref):
                    params["data_creator"] = shards_ref_to_creator(shards_ref)
                    return worker.step.remote(**params)

                stats_shards = ray_xshards.transform_shards_with_actors(
                    self.remote_workers, transform_func, gang_scheduling=True)
            else:
                val_max_length, val_ray_xshards = process_spark_xshards(
                    validation_data_creator, self.num_workers)

                def zip_func(worker, this_shards_ref, that_shards_ref):
                    params["data_creator"] = shards_ref_to_creator(
                        this_shards_ref)
                    params["validation_data_creator"] =\
                        shards_ref_to_creator(that_shards_ref)
                    return worker.step.remote(**params)

                stats_shards = ray_xshards.zip_shards_with_actors(
                    val_ray_xshards,
                    self.remote_workers,
                    zip_func,
                    gang_scheduling=True)
            worker_stats = stats_shards.collect()
        else:
            params["data_creator"] = data_creator
            params["validation_data_creator"] = validation_data_creator
            params_list = [params] * self.num_workers

            worker_stats = ray.get([
                self.remote_workers[i].step.remote(**params_list[i])
                for i in range(self.num_workers)
            ])
            worker_stats = list(itertools.chain.from_iterable(worker_stats))
        stats = worker_stats[0].copy()
        return stats
Example #25
    def test_nnEstimator(self):
        from zoo.pipeline.nnframes import NNModel
        linear_model = Sequential().add(Linear(2, 2))
        mse_criterion = MSECriterion()
        df = self.get_estimator_df()
        est = Estimator.from_bigdl(model=linear_model,
                                   loss=mse_criterion,
                                   optimizer=Adam(),
                                   feature_preprocessing=SeqToTensor([2]),
                                   label_preprocessing=SeqToTensor([2]))
        res0 = est.predict(df)
        res0_c = res0.collect()
        est.fit(df, 1, batch_size=4)
        nn_model = NNModel(est.get_model(),
                           feature_preprocessing=SeqToTensor([2]))
        res1 = nn_model.transform(df)
        res2 = est.predict(df)
        res1_c = res1.collect()
        res2_c = res2.collect()
        assert type(res1).__name__ == 'DataFrame'
        assert type(res2).__name__ == 'DataFrame'
        assert len(res1_c) == len(res2_c)
        for idx in range(len(res1_c)):
            assert res1_c[idx]["prediction"] == res2_c[idx]["prediction"]
        with tempfile.TemporaryDirectory() as tempdirname:
            temp_path = os.path.join(tempdirname, "model")
            est.save(temp_path)
            est2 = Estimator.from_bigdl(model=linear_model, loss=mse_criterion)
            est2.load(temp_path,
                      optimizer=Adam(),
                      loss=mse_criterion,
                      feature_preprocessing=SeqToTensor([2]),
                      label_preprocessing=SeqToTensor([2]))
            est2.set_constant_gradient_clipping(0.1, 1.2)
            est2.clear_gradient_clipping()
            res3 = est2.predict(df)
            res3_c = res3.collect()
            assert type(res3).__name__ == 'DataFrame'
            assert len(res1_c) == len(res3_c)
            for idx in range(len(res1_c)):
                assert res1_c[idx]["prediction"] == res3_c[idx]["prediction"]
            est2.fit(df, 4, batch_size=4)

        data = self.sc.parallelize([((2.0, 1.0), (1.0, 2.0)),
                                    ((1.0, 2.0), (2.0, 1.0)),
                                    ((2.0, 1.0), (1.0, 2.0)),
                                    ((1.0, 2.0), (2.0, 1.0))])
        data_shard = SparkXShards(data)
        data_shard = data_shard.transform_shard(
            lambda feature_label_tuple: {
                "x": [
                    np.expand_dims(np.array(feature_label_tuple[0][0]), axis=0
                                   ),
                    np.expand_dims(np.array(feature_label_tuple[0][1]), axis=0)
                ],
                "y": [
                    np.expand_dims(np.array(feature_label_tuple[1][0]), axis=0
                                   ),
                    np.expand_dims(np.array(feature_label_tuple[1][1]), axis=0)
                ]
            })
        res4 = est.predict(data_shard)
        res4_c = res4.collect()
        assert type(res4).__name__ == 'SparkXShards'
        for idx in range(len(res4_c)):
            assert abs(res4_c[idx]["prediction"][0][0] -
                       res3_c[idx]["prediction"][0]) == 0
            assert abs(res4_c[idx]["prediction"][0][1] -
                       res3_c[idx]["prediction"][1]) == 0
        est.fit(data_shard, 1, batch_size=4)
        res5 = est.predict(data_shard)
        res5_c = res5.collect()
        res6 = est.predict(df)
        res6_c = res6.collect()
        for idx in range(len(res5_c)):
            assert abs(res5_c[idx]["prediction"][0][0] -
                       res6_c[idx]["prediction"][0]) == 0
            assert abs(res5_c[idx]["prediction"][0][1] -
                       res6_c[idx]["prediction"][1]) == 0
Example #26
def read_file_spark(file_path, file_type, **kwargs):
    sc = init_nncontext()
    node_num, core_num = get_node_and_core_number()
    backend = OrcaContext.pandas_read_backend

    if backend == "pandas":
        file_url_splits = file_path.split("://")
        prefix = file_url_splits[0]

        file_paths = []
        if isinstance(file_path, list):
            [
                file_paths.extend(extract_one_path(path, os.environ))
                for path in file_path
            ]
        else:
            file_paths = extract_one_path(file_path, os.environ)

        if not file_paths:
            raise Exception(
                "The file path is invalid or empty, please check your data")

        num_files = len(file_paths)
        total_cores = node_num * core_num
        num_partitions = num_files if num_files < total_cores else total_cores
        rdd = sc.parallelize(file_paths, num_partitions)

        if prefix == "hdfs":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_hdfs_file_list(iter, file_type, **kwargs))
        elif prefix == "s3":
            pd_rdd = rdd.mapPartitions(
                lambda iter: read_pd_s3_file_list(iter, file_type, **kwargs))
        else:

            def loadFile(iterator):
                for x in iterator:
                    df = read_pd_file(x, file_type, **kwargs)
                    yield df

            pd_rdd = rdd.mapPartitions(loadFile)
    else:  # Spark backend; spark.read.csv/json accepts a folder path as input
        assert file_type == "json" or file_type == "csv", \
            "Unsupported file type: %s. Only csv and json files are supported for now" % file_type
        from pyspark.sql import SQLContext
        sqlContext = SQLContext.getOrCreate(sc)
        spark = sqlContext.sparkSession
        # TODO: add S3 credentials

        # The following implementation is adapted from
        # https://github.com/databricks/koalas/blob/master/databricks/koalas/namespace.py
        # with some modifications.

        if "mangle_dupe_cols" in kwargs:
            assert kwargs[
                "mangle_dupe_cols"], "mangle_dupe_cols can only be True"
            kwargs.pop("mangle_dupe_cols")
        if "parse_dates" in kwargs:
            assert not kwargs["parse_dates"], "parse_dates can only be False"
            kwargs.pop("parse_dates")

        names = kwargs.get("names", None)
        if "names" in kwargs:
            kwargs.pop("names")
        usecols = kwargs.get("usecols", None)
        if "usecols" in kwargs:
            kwargs.pop("usecols")
        dtype = kwargs.get("dtype", None)
        if "dtype" in kwargs:
            kwargs.pop("dtype")
        squeeze = kwargs.get("squeeze", False)
        if "squeeze" in kwargs:
            kwargs.pop("squeeze")
        index_col = kwargs.get("index_col", None)
        if "index_col" in kwargs:
            kwargs.pop("index_col")

        if file_type == "csv":
            # Handle pandas-compatible keyword arguments
            kwargs["inferSchema"] = True
            header = kwargs.get("header", "infer")
            if isinstance(names, str):
                kwargs["schema"] = names
            if header == "infer":
                header = 0 if names is None else None
            if header == 0:
                kwargs["header"] = True
            elif header is None:
                kwargs["header"] = False
            else:
                raise ValueError("Unknown header argument {}".format(header))
            if "quotechar" in kwargs:
                quotechar = kwargs["quotechar"]
                kwargs.pop("quotechar")
                kwargs["quote"] = quotechar
            if "escapechar" in kwargs:
                escapechar = kwargs["escapechar"]
                kwargs.pop("escapechar")
                kwargs["escape"] = escapechar
            # sep and comment are the same as pandas
            if "comment" in kwargs:
                comment = kwargs["comment"]
                if not isinstance(comment, str) or len(comment) != 1:
                    raise ValueError(
                        "Only length-1 comment characters supported")
            df = spark.read.csv(file_path, **kwargs)
            if header is None:
                df = df.selectExpr(*[
                    "`%s` as `%s`" % (field.name, i)
                    for i, field in enumerate(df.schema)
                ])
        else:
            df = spark.read.json(file_path, **kwargs)

        # Handle pandas-compatible postprocessing arguments
        if usecols is not None and not callable(usecols):
            usecols = list(usecols)
        renamed = False
        if isinstance(names, list):
            if len(set(names)) != len(names):
                raise ValueError(
                    "Found duplicate names, please check your names input")
            if usecols is not None:
                if not callable(usecols):
                    # usecols is list
                    if len(names) != len(usecols) and len(names) != len(
                            df.schema):
                        raise ValueError("Passed names did not match usecols")
                if len(names) == len(df.schema):
                    df = df.selectExpr(*[
                        "`%s` as `%s`" % (field.name, name)
                        for field, name in zip(df.schema, names)
                    ])
                    renamed = True

            else:
                if len(names) != len(df.schema):
                    raise ValueError(
                        "The number of names [%s] does not match the number "
                        "of columns [%d]. Try names by a Spark SQL DDL-formatted "
                        "string." % (len(names), len(df.schema)))
                df = df.selectExpr(*[
                    "`%s` as `%s`" % (field.name, name)
                    for field, name in zip(df.schema, names)
                ])
                renamed = True
        index_map = dict([(i, field.name)
                          for i, field in enumerate(df.schema)])
        if usecols is not None:
            if callable(usecols):
                cols = [
                    field.name for field in df.schema if usecols(field.name)
                ]
                missing = []
            elif all(isinstance(col, int) for col in usecols):
                cols = [
                    field.name for i, field in enumerate(df.schema)
                    if i in usecols
                ]
                missing = [
                    col for col in usecols
                    if col >= len(df.schema) or df.schema[col].name not in cols
                ]
            elif all(isinstance(col, str) for col in usecols):
                cols = [
                    field.name for field in df.schema if field.name in usecols
                ]
                if isinstance(names, list):
                    missing = [c for c in usecols if c not in names]
                else:
                    missing = [col for col in usecols if col not in cols]
            else:
                raise ValueError(
                    "usecols must only be list-like of all strings, "
                    "all unicode, all integers or a callable.")
            if len(missing) > 0:
                raise ValueError(
                    "usecols do not match columns, columns expected but not found: %s"
                    % missing)
            if len(cols) > 0:
                df = df.select(cols)
                if isinstance(names, list):
                    if not renamed:
                        df = df.selectExpr(*[
                            "`%s` as `%s`" % (col, name)
                            for col, name in zip(cols, names)
                        ])
                        # update index map after rename
                        for index, col in index_map.items():
                            if col in cols:
                                index_map[index] = names[cols.index(col)]

        if df.rdd.getNumPartitions() < node_num:
            df = df.repartition(node_num)

        def to_pandas(columns, squeeze=False, index_col=None):
            def f(iter):
                import pandas as pd
                data = list(iter)
                pd_df = pd.DataFrame(data, columns=columns)
                if dtype is not None:
                    if isinstance(dtype, dict):
                        for col, type in dtype.items():
                            if isinstance(col, str):
                                if col not in pd_df.columns:
                                    raise ValueError(
                                        "column to be set type is not"
                                        " in current dataframe")
                                pd_df[col] = pd_df[col].astype(type)
                            elif isinstance(col, int):
                                if index_map[col] not in pd_df.columns:
                                    raise ValueError(
                                        "column index to be set type is not"
                                        " in current dataframe")
                                pd_df[index_map[col]] = pd_df[
                                    index_map[col]].astype(type)
                    else:
                        pd_df = pd_df.astype(dtype)
                if squeeze and len(pd_df.columns) == 1:
                    pd_df = pd_df.iloc[:, 0]
                if index_col:
                    pd_df = pd_df.set_index(index_col)

                return [pd_df]

            return f

        pd_rdd = df.rdd.mapPartitions(to_pandas(df.columns, squeeze,
                                                index_col))

    try:
        data_shards = SparkXShards(pd_rdd)
    except Exception as e:
        alternative_backend = "pandas" if backend == "spark" else "spark"
        print(
            "An error occurred when reading files with '%s' backend, you may switch to '%s' "
            "backend for another try. You can set the backend using "
            "OrcaContext.pandas_read_backend" % (backend, alternative_backend))
        raise e
    return data_shards
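A hypothetical invocation of the Spark read backend above, using pandas-style keyword arguments that the function explicitly translates (names and dtype); the file path is a placeholder and the CSV is assumed to have exactly two columns so the rename branch applies:

from zoo.orca import OrcaContext

OrcaContext.pandas_read_backend = "spark"   # assumed settable, matching the read at the top of the function
# placeholder path; names are mapped onto the inferred Spark schema and
# dtype is applied in the pandas post-processing step
shards = read_file_spark("hdfs://namenode:8020/data/sales.csv", "csv",
                         names=["id", "sale_price"],
                         dtype={"sale_price": "float64"})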
Example #27
    def predict(self, data):
        """
        Predict input data

        :param data: data to be predicted. XShards, numpy array and list of numpy arrays are
        supported. If data is XShards, each partition is a dictionary of {'x': feature},
        where feature is a numpy array or a list of numpy arrays.
        :return: predicted result.
         If the input data is XShards, the predict result is a XShards, each partition of the
         XShards is a dictionary of {'prediction': result}, where the result is a numpy array or
         a list of numpy arrays.
         If the input data is numpy arrays or list of numpy arrays, the predict result is a numpy
         array or a list of numpy arrays.
        """
        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[1] <= batch_size, \
                    "The batch size of input data (the second dim) should be less than the model " \
                    "batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "a ndarray, but get " + \
                                                         elem.__class__.__name__
                    assert elem.shape[1] <= batch_size, "The batch size of each input data (the " \
                                                        "second dim) should be less than the " \
                                                        "model batch size, otherwise some inputs " \
                                                        "will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be a ndarray or a list of ndarray."
                )
            return dict_data["x"]

        sc = init_nncontext()

        if isinstance(data, SparkXShards):
            assert sc is not None, "You should pass sc(spark context) if data is a XShards."
            from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
            transformed_data = data.transform_shard(predict_transform,
                                                    self.batch_size)
            result_rdd = self.model.distributed_predict(
                transformed_data.rdd, sc)

            def update_shard(data):
                shard, y = data
                shard["prediction"] = y
                return shard

            return SparkXShards(data.rdd.zip(result_rdd).map(update_shard))
        elif isinstance(data, (np.ndarray, list)):
            total_core_num = self.core_num * self.node_num
            if isinstance(data, np.ndarray):
                assert data.shape[1] <= self.batch_size, "The batch size of input data (the " \
                                                         "second dim) should be less than the " \
                                                         "model batch size, otherwise some " \
                                                         "inputs will be ignored."
                split_num = min(total_core_num, data.shape[0])
                arrays = np.array_split(data, split_num)
                data_rdd = sc.parallelize(arrays, numSlices=split_num)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = min(total_core_num, flattened[0].shape[0])
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    assert x.shape[1] <= self.batch_size, "The batch size of each input data (" \
                                                          "the second dim) should be less than " \
                                                          "the model batch size, otherwise some " \
                                                          "inputs will be ignored."
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)

            result_rdd = self.model.distributed_predict(data_rdd, sc)
            result_arr_list = result_rdd.collect()
            result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, a numpy array and a list of numpy arrays are supported "
                "as input data, but get " + data.__class__.__name__)
Example #28
    def predict(self, data, feature_cols=None):
        """
        Predict input data

        :param data: data to be predicted. XShards, Spark DataFrame, numpy array and list of numpy
               arrays are supported. If data is XShards, each partition is a dictionary of {'x':
               feature}, where feature is a numpy array or a list of numpy arrays.
        :param feature_cols: Feature column name(s) of data. Only used when data is a Spark
               DataFrame. Default: None.
        :return: predicted result.
                 If the input data is XShards, the predict result is a XShards, each partition
                 of the XShards is a dictionary of {'prediction': result}, where the result is a
                 numpy array or a list of numpy arrays.
                 If the input data is numpy arrays or list of numpy arrays, the predict result is
                 a numpy array or a list of numpy arrays.
        """
        from pyspark.sql import DataFrame

        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[0] <= batch_size, \
                    "The batch size of input data (the first dim) should not be larger than the " \
                    "model batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "a ndarray, but get " + \
                                                         elem.__class__.__name__
                    assert elem.shape[0] <= batch_size, "The batch size of each input data (the " \
                                                        "first dim) should not be larger than " \
                                                        "the model batch size, otherwise some " \
                                                        "inputs will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be a ndarray or a list of ndarray."
                )
            return feature_data

        sc = init_nncontext()

        if isinstance(data, DataFrame):
            from zoo.orca.learn.utils import dataframe_to_xshards, convert_predict_rdd_to_dataframe
            xshards, _ = dataframe_to_xshards(data,
                                              validation_data=None,
                                              feature_cols=feature_cols,
                                              label_cols=None,
                                              mode="predict")
            transformed_data = xshards.transform_shard(predict_transform,
                                                       self.batch_size)
            result_rdd = self.model.distributed_predict(
                transformed_data.rdd, sc)

            def delete_useless_result(data):
                shard, y = data
                data_length = len(shard["x"])
                return y[:data_length]

            result_rdd = xshards.rdd.zip(result_rdd).map(delete_useless_result)
            return convert_predict_rdd_to_dataframe(
                data, result_rdd.flatMap(lambda data: data))
        elif isinstance(data, SparkXShards):
            transformed_data = data.transform_shard(predict_transform,
                                                    self.batch_size)
            result_rdd = self.model.distributed_predict(
                transformed_data.rdd, sc)

            def update_shard(data):
                shard, y = data
                data_length = len(shard["x"])
                shard["prediction"] = y[:data_length]
                return shard

            return SparkXShards(data.rdd.zip(result_rdd).map(update_shard))
        elif isinstance(data, (np.ndarray, list)):
            if isinstance(data, np.ndarray):
                split_num = math.ceil(len(data) / self.batch_size)
                arrays = np.array_split(data, split_num)
                data_length_list = list(map(lambda arr: len(arr), arrays))
                data_rdd = sc.parallelize(arrays, numSlices=split_num)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                data_to_be_rdd = []
                split_num = math.ceil(flattened[0].shape[0] / self.batch_size)
                for i in range(split_num):
                    data_to_be_rdd.append([])
                for x in flattened:
                    assert isinstance(x, np.ndarray), "the data in the data list should be " \
                                                      "ndarrays, but get " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    x_parts = np.array_split(x, split_num)
                    # record the real length of each split so that padded
                    # predictions can be trimmed back to it later
                    data_length_list = list(
                        map(lambda arr: len(arr), x_parts))
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)

            result_rdd = self.model.distributed_predict(data_rdd, sc)
            result_arr_list = result_rdd.collect()
            for i in range(0, len(result_arr_list)):
                result_arr_list[i] = result_arr_list[i][:data_length_list[i]]
            result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, Spark DataFrame, a numpy array and a list of numpy arr"
                "ays are supported as input data, but get " +
                data.__class__.__name__)