Example #1
def predict(self, data, batch_size=4, feature_cols=None):
    from zoo.orca.learn.utils import convert_predict_to_xshard
    if isinstance(data, SparkXShards):
        from zoo.orca.data.utils import to_sample
        # Flatten each shard dict into Sample records for distributed prediction.
        data_rdd = data.rdd.flatMap(to_sample)
    else:
        raise ValueError("Data should be XShards, each element needs to be {'x': a feature "
                         "numpy array}.")
    predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)
    # Wrap the prediction RDD back into an XShards of {'prediction': ...} dicts.
    return convert_predict_to_xshard(predicted_rdd)
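A minimal usage sketch for this variant, assuming an already-created estimator instance est that exposes the predict method above (the XShards construction and all shapes are illustrative):

    import numpy as np
    from zoo.orca.data import XShards

    # Each element of the XShards must be {'x': a feature numpy array}.
    shards = XShards.partition({"x": np.random.randn(100, 10).astype(np.float32)})

    # predict returns an XShards of {'prediction': numpy array} dicts.
    result_shards = est.predict(shards, batch_size=4)
    first_prediction = result_shards.collect()[0]["prediction"]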
Example #2
    def predict(
        self,
        data,
        batch_size=4,
        feature_cols=None,
        hard_code_batch_size=False,
        auto_shard_files=True,
    ):
        """
        Predict input data
        :param data: data to be predicted.
        It can be XShards, Spark DataFrame, or tf.data.Dataset.
        If data is XShard, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays}.
        If data is tf.data.Dataset, each element is feature tensor tuple
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: if require hard code batch size for prediction.
         The default value is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is also a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes
         original columns plus 'prediction' column. The 'prediction' column can be FloatType,
         VectorUDT or Array of VectorUDT depending on model outputs shape.
        """

        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature_cols is None; it must be specified when data is a Spark DataFrame"

        dataset = to_dataset(
            data,
            batch_size=-1,
            batch_per_thread=batch_size,
            validation_data=None,
            feature_cols=feature_cols,
            labels_cols=None,
            hard_code_batch_size=hard_code_batch_size,
            sequential_order=True,
            shuffle=False,
            auto_shard_files=auto_shard_files,
        )

        predicted_rdd = self.model.predict(dataset, batch_size)
        if isinstance(data, DataFrame):
            return convert_predict_to_dataframe(data, predicted_rdd)
        elif isinstance(data, (SparkXShards, tf.data.Dataset)):
            return convert_predict_to_xshard(predicted_rdd)
        else:
            return predicted_rdd
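Since this variant also accepts a Spark DataFrame, here is a hedged sketch of that path; df and the column name "feature" are placeholders for an existing DataFrame with a numeric vector/array feature column:

    # feature_cols must be given for DataFrame input (see the assert above).
    pred_df = est.predict(df, batch_size=4, feature_cols=["feature"])

    # The result keeps the original columns and adds a 'prediction' column.
    pred_df.select("feature", "prediction").show(5, truncate=False)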
Example #3
    def predict(self, data, batch_size=4,
                feature_cols=None,
                hard_code_batch_size=False,
                auto_shard_files=False,
                ):
        """
        Predict input data
        :param data: data to be predicted. It can be XShards, Spark DataFrame.
        If data is XShards, each element needs to be {'x': a feature numpy array
         or a tuple of feature numpy arrays}.
        :param batch_size: batch size per thread
        :param feature_cols: list of feature column names if input data is Spark DataFrame.
        :param hard_code_batch_size: whether to hard code batch size for prediction.
         The default value is False.
        :return: predicted result.
         If input data is XShards or tf.data.Dataset, the predict result is a XShards,
         and the schema for each result is: {'prediction': predicted numpy array or
          list of predicted numpy arrays}.
         If input data is Spark DataFrame, the predict result is a DataFrame which includes original
         columns plus 'prediction' column. The 'prediction' column can be FloatType, VectorUDT
         or Array of VectorUDT depending on model outputs shape.
        """

        assert self.outputs is not None, \
            "outputs is None; it should not be None in prediction"
        if isinstance(data, DataFrame):
            assert feature_cols is not None, \
                "feature_cols is None; it must be specified when data is a Spark DataFrame"

        assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used for " \
                                             "estimator prediction"

        dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                             validation_data=None,
                             feature_cols=feature_cols, labels_cols=None,
                             hard_code_batch_size=hard_code_batch_size,
                             sequential_order=True,
                             shuffle=False,
                             auto_shard_files=auto_shard_files,
                             )

        flat_inputs = nest.flatten(self.inputs)
        flat_outputs = nest.flatten(self.outputs)
        tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs, outputs=flat_outputs)
        predicted_rdd = tfnet.predict(dataset)
        if isinstance(data, DataFrame):
            return convert_predict_to_dataframe(data, predicted_rdd)
        elif isinstance(data, SparkXShards):  # tf.data.Dataset was already rejected above
            return convert_predict_to_xshard(predicted_rdd)
        else:
            return predicted_rdd
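Because this variant explicitly rejects tf.data.Dataset input, callers use XShards or a DataFrame. A sketch of the XShards path under the same assumptions as above (est is a graph-based TF estimator instance; shapes are illustrative):

    import numpy as np
    from zoo.orca.data import XShards

    shards = XShards.partition({"x": np.random.randn(64, 10).astype(np.float32)})
    pred_shards = est.predict(shards, batch_size=8)
    predictions = [shard["prediction"] for shard in pred_shards.collect()]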
Example #4
def predict(self,
            data,
            batch_size=8,
            feature_cols="features",
            sample_preprocessing=None):
    if isinstance(data, DataFrame):
        if isinstance(feature_cols, list):
            # Merge multiple feature columns into a single "features" column.
            data, _, feature_cols = \
                BigDLEstimatorWrapper._combine_cols(data, feature_cols, col_name="features")
        self.nn_model.setBatchSize(batch_size).setFeaturesCol(feature_cols)
        if sample_preprocessing is not None:
            self.nn_model.setSamplePreprocessing(sample_preprocessing)
        return self.nn_model.transform(data)
    elif isinstance(data, SparkXShards):
        from zoo.orca.data.utils import to_sample
        from zoo.orca.learn.utils import convert_predict_to_xshard
        sample_rdd = data.rdd.flatMap(to_sample)
        result_rdd = self.model.predict(sample_rdd)
        return convert_predict_to_xshard(result_rdd)
    else:
        raise ValueError(
            "Data should be XShards or a Spark DataFrame, but got " +
            data.__class__.__name__)
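A sketch of the DataFrame path for this BigDL-backed variant; when feature_cols is a list, the named columns are first merged into a single "features" column, so multi-column input looks like this (df and the column names are placeholders):

    # Two numeric columns are combined internally before transform().
    pred_df = est.predict(df, batch_size=8, feature_cols=["f1", "f2"])
    pred_df.select("prediction").show(5)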
Example #5
    def predict(self, data, **kwargs):
        def predict_transform(dict_data, batch_size):
            assert isinstance(dict_data, dict), "each shard should be a dict"
            assert "x" in dict_data, "key 'x' should be in each shard"
            feature_data = dict_data["x"]
            if isinstance(feature_data, np.ndarray):
                assert feature_data.shape[1] <= batch_size, \
                    "The batch size of input data (the second dim) should not exceed the model " \
                    "batch size, otherwise some inputs will be ignored."
            elif isinstance(feature_data, list):
                for elem in feature_data:
                    assert isinstance(elem, np.ndarray), "Each element in the x list should be " \
                                                         "an ndarray, but got " + \
                                                         elem.__class__.__name__
                    assert elem.shape[1] <= batch_size, "The batch size of each input data (the " \
                                                        "second dim) should not exceed the " \
                                                        "model batch size, otherwise some inputs " \
                                                        "will be ignored."
            else:
                raise ValueError(
                    "x in each shard should be an ndarray or a list of ndarrays."
                )
            return dict_data["x"]

        sc = init_nncontext()

        if isinstance(data, SparkXShards):
            assert sc is not None, "You should pass sc (SparkContext) if data is an XShards."
            from zoo.orca.learn.utils import convert_predict_to_xshard
            data = data.transform_shard(predict_transform, self.batch_size)
            result_rdd = self.model.distributed_predict(data.rdd, sc)
            return convert_predict_to_xshard(result_rdd)
        elif isinstance(data, (np.ndarray, list)):
            total_core_num = self.core_num * self.node_num
            if isinstance(data, np.ndarray):
                assert data.shape[1] <= self.batch_size, "The batch size of input data (the " \
                                                         "second dim) should not exceed the " \
                                                         "model batch size, otherwise some " \
                                                         "inputs will be ignored."
                split_num = min(total_core_num, data.shape[0])
                arrays = np.array_split(data, split_num)
                data_rdd = sc.parallelize(arrays, numSlices=split_num)
            elif isinstance(data, list):
                flattened = nest.flatten(data)
                data_length = len(flattened[0])
                split_num = min(total_core_num, flattened[0].shape[0])
                # One bucket of array slices per future RDD partition.
                data_to_be_rdd = [[] for _ in range(split_num)]
                for x in flattened:
                    assert isinstance(x, np.ndarray), "each element in the data list should be " \
                                                      "an ndarray, but got " + \
                                                      x.__class__.__name__
                    assert len(x) == data_length, \
                        "the ndarrays in data must all have the same size in the first dimension" \
                        ", got first ndarray of size {} and another {}".format(data_length, len(x))
                    assert x.shape[1] <= self.batch_size, "The batch size of each input data (" \
                                                          "the second dim) should not exceed " \
                                                          "the model batch size, otherwise some " \
                                                          "inputs will be ignored."
                    x_parts = np.array_split(x, split_num)
                    for idx, x_part in enumerate(x_parts):
                        data_to_be_rdd[idx].append(x_part)

                data_to_be_rdd = [
                    nest.pack_sequence_as(data, shard)
                    for shard in data_to_be_rdd
                ]
                data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)

            result_rdd = self.model.distributed_predict(data_rdd, sc)
            result_arr_list = result_rdd.collect()
            result_arr = np.concatenate(result_arr_list, axis=0)
            return result_arr
        else:
            raise ValueError(
                "Only XShards, a numpy array and a list of numpy arrays are supported "
                "as input data, but get " + data.__class__.__name__)