def predict(self, data, batch_size=4, feature_cols=None):
    """
    Predict input data

    :param data: data to be predicted. It should be an XShards, and each element
           needs to be {'x': a feature numpy array}.
    :param batch_size: batch size per thread
    :param feature_cols: not used in this implementation; accepted for API consistency.
    :return: predicted result as an XShards, with each element being
             {'prediction': predicted numpy array or list of predicted numpy arrays}.
    """
    from zoo.orca.learn.utils import convert_predict_to_xshard
    if isinstance(data, SparkXShards):
        from zoo.orca.data.utils import to_sample
        data_rdd = data.rdd.flatMap(to_sample)
    else:
        raise ValueError("Data should be XShards, each element needs to be "
                         "{'x': a feature numpy array}.")
    predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)
    return convert_predict_to_xshard(predicted_rdd)
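
# Usage sketch for the XShards-only predict above (illustrative, not from the
# source): `est` stands for any estimator instance exposing this predict; the
# shard layout and array shapes are assumptions.
import numpy as np
from zoo.orca.data import XShards

shards = XShards.partition({"x": np.random.rand(100, 10).astype(np.float32)})
result_shards = est.predict(shards, batch_size=4)
# Each collected element is {'prediction': a predicted numpy array}.
predictions = [shard["prediction"] for shard in result_shards.collect()]
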
def predict(self, data, batch_size=4, feature_cols=None,
            hard_code_batch_size=False, auto_shard_files=True):
    """
    Predict input data

    :param data: data to be predicted. It can be XShards, Spark DataFrame or
           tf.data.Dataset. If data is XShards, each element needs to be
           {'x': a feature numpy array or a tuple of feature numpy arrays}.
           If data is tf.data.Dataset, each element is a feature tensor tuple.
    :param batch_size: batch size per thread
    :param feature_cols: list of feature column names if input data is Spark DataFrame.
    :param hard_code_batch_size: whether to hard code the batch size for prediction.
           The default value is False.
    :param auto_shard_files: whether to automatically shard the dataset by files.
           The default value is True.
    :return: predicted result.
             If input data is XShards or tf.data.Dataset, the predict result is also
             an XShards, and the schema for each element is
             {'prediction': predicted numpy array or list of predicted numpy arrays}.
             If input data is Spark DataFrame, the predict result is a DataFrame which
             includes the original columns plus a 'prediction' column. The 'prediction'
             column can be FloatType, VectorUDT or Array of VectorUDT depending on the
             model output shape.
    """
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"
    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None, feature_cols=feature_cols,
                         labels_cols=None, hard_code_batch_size=hard_code_batch_size,
                         sequential_order=True, shuffle=False,
                         auto_shard_files=auto_shard_files)

    predicted_rdd = self.model.predict(dataset, batch_size)
    if isinstance(data, DataFrame):
        return convert_predict_to_dataframe(data, predicted_rdd)
    elif isinstance(data, (SparkXShards, tf.data.Dataset)):
        return convert_predict_to_xshard(predicted_rdd)
    else:
        return predicted_rdd
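
# A hedged sketch of the Spark DataFrame path of the predict above. The session,
# estimator and column names (`spark`, `est`, "features") are assumptions for
# illustration; predictions come back as the input DataFrame plus a 'prediction'
# column.
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([0.1, 0.2]),),
                            (Vectors.dense([0.3, 0.4]),)], ["features"])
prediction_df = est.predict(df, batch_size=4, feature_cols=["features"])
prediction_df.select("prediction").show()
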
def predict(self, data, batch_size=4, feature_cols=None,
            hard_code_batch_size=False, auto_shard_files=False):
    """
    Predict input data

    :param data: data to be predicted. It can be XShards or Spark DataFrame.
           If data is XShards, each element needs to be {'x': a feature numpy array
           or a tuple of feature numpy arrays}.
    :param batch_size: batch size per thread
    :param feature_cols: list of feature column names if input data is Spark DataFrame.
    :param hard_code_batch_size: whether to hard code the batch size for prediction.
           The default value is False.
    :param auto_shard_files: whether to automatically shard the dataset by files.
           The default value is False.
    :return: predicted result.
             If input data is XShards, the predict result is also an XShards, and the
             schema for each element is {'prediction': predicted numpy array or list
             of predicted numpy arrays}.
             If input data is Spark DataFrame, the predict result is a DataFrame which
             includes the original columns plus a 'prediction' column. The 'prediction'
             column can be FloatType, VectorUDT or Array of VectorUDT depending on the
             model output shape.
    """
    assert self.outputs is not None, \
        "output is None; it should not be None in prediction"
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"
    assert not is_tf_data_dataset(data), \
        "tf.data.Dataset currently cannot be used for estimator prediction"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None, feature_cols=feature_cols,
                         labels_cols=None, hard_code_batch_size=hard_code_batch_size,
                         sequential_order=True, shuffle=False,
                         auto_shard_files=auto_shard_files)

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs,
                               outputs=flat_outputs)
    predicted_rdd = tfnet.predict(dataset)

    if isinstance(data, DataFrame):
        return convert_predict_to_dataframe(data, predicted_rdd)
    elif isinstance(data, SparkXShards):
        return convert_predict_to_xshard(predicted_rdd)
    else:
        return predicted_rdd
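
# Sketch for the graph-mode predict above (illustrative): a model with two input
# placeholders receives a tuple of feature arrays under 'x' in each shard. `est`
# and the shapes are assumptions.
import numpy as np
from zoo.orca.data import XShards

shards = XShards.partition({"x": (np.random.rand(64, 8).astype(np.float32),
                                  np.random.rand(64, 4).astype(np.float32))})
result_shards = est.predict(shards, batch_size=4)
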
def predict(self, data, batch_size=8, feature_cols="features",
            sample_preprocessing=None):
    """
    Predict input data

    :param data: data to be predicted. It can be XShards or Spark DataFrame.
           If data is XShards, each element needs to be {'x': a feature numpy array}.
    :param batch_size: batch size used for prediction.
    :param feature_cols: feature column name(s) if input data is Spark DataFrame.
           A list of column names is combined into a single "features" column.
    :param sample_preprocessing: optional preprocessing applied to each sample
           before prediction when input data is Spark DataFrame.
    :return: predicted result as a Spark DataFrame or an XShards, matching the
             input data type.
    """
    if isinstance(data, DataFrame):
        if isinstance(feature_cols, list):
            data, _, feature_cols = \
                BigDLEstimatorWrapper._combine_cols(data, feature_cols,
                                                    col_name="features")
        self.nn_model.setBatchSize(batch_size).setFeaturesCol(feature_cols)
        if sample_preprocessing is not None:
            self.nn_model.setSamplePreprocessing(sample_preprocessing)
        return self.nn_model.transform(data)
    elif isinstance(data, SparkXShards):
        from zoo.orca.data.utils import to_sample
        from zoo.orca.learn.utils import convert_predict_to_xshard
        sample_rdd = data.rdd.flatMap(to_sample)
        result_rdd = self.model.predict(sample_rdd)
        return convert_predict_to_xshard(result_rdd)
    else:
        raise ValueError("Data should be XShards or Spark DataFrame, but got "
                         + data.__class__.__name__)
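
# Usage sketch (illustrative): with a Spark DataFrame and a list of feature
# columns, the predict above merges the columns into a single "features" column
# via _combine_cols before transforming. `df`, `est` and the column names are
# assumptions.
prediction_df = est.predict(df, batch_size=8, feature_cols=["user_col", "item_col"])
prediction_df.select("prediction").show()
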
def predict(self, data, **kwargs):
    """
    Predict input data

    :param data: data to be predicted. It can be an XShards (each element being
           {'x': a feature numpy array or a list of feature numpy arrays}), a numpy
           array, or a list of numpy arrays. The second dimension of each input
           array must not exceed the model batch size.
    :return: an XShards if the input is an XShards, otherwise a numpy array of
             the concatenated prediction results.
    """
    def predict_transform(dict_data, batch_size):
        assert isinstance(dict_data, dict), "each shard should be a dict"
        assert "x" in dict_data, "key 'x' should be in each shard"
        feature_data = dict_data["x"]
        if isinstance(feature_data, np.ndarray):
            assert feature_data.shape[1] <= batch_size, \
                "The batch size of input data (the second dim) should not exceed " \
                "the model batch size, otherwise some inputs will be ignored."
        elif isinstance(feature_data, list):
            for elem in feature_data:
                assert isinstance(elem, np.ndarray), \
                    "Each element in the x list should be a ndarray, but got " + \
                    elem.__class__.__name__
                assert elem.shape[1] <= batch_size, \
                    "The batch size of each input data (the second dim) should " \
                    "not exceed the model batch size, otherwise some inputs will " \
                    "be ignored."
        else:
            raise ValueError("x in each shard should be a ndarray or a list of "
                             "ndarray.")
        return dict_data["x"]

    sc = init_nncontext()
    if isinstance(data, SparkXShards):
        assert sc is not None, "SparkContext should not be None if data is an XShards."
        from zoo.orca.learn.utils import convert_predict_to_xshard
        data = data.transform_shard(predict_transform, self.batch_size)
        result_rdd = self.model.distributed_predict(data.rdd, sc)
        return convert_predict_to_xshard(result_rdd)
    elif isinstance(data, (np.ndarray, list)):
        total_core_num = self.core_num * self.node_num
        if isinstance(data, np.ndarray):
            assert data.shape[1] <= self.batch_size, \
                "The batch size of input data (the second dim) should not exceed " \
                "the model batch size, otherwise some inputs will be ignored."
            split_num = min(total_core_num, data.shape[0])
            arrays = np.array_split(data, split_num)
            data_rdd = sc.parallelize(arrays, numSlices=split_num)
        elif isinstance(data, list):
            flattened = nest.flatten(data)
            data_length = len(flattened[0])
            data_to_be_rdd = []
            split_num = min(total_core_num, flattened[0].shape[0])
            for i in range(split_num):
                data_to_be_rdd.append([])
            for x in flattened:
                assert isinstance(x, np.ndarray), \
                    "the data in the data list should be ndarrays, but got " + \
                    x.__class__.__name__
                assert len(x) == data_length, \
                    "the ndarrays in data must all have the same size in first " \
                    "dimension, got first ndarray of size {} and another {}".format(
                        data_length, len(x))
                assert x.shape[1] <= self.batch_size, \
                    "The batch size of each input data (the second dim) should " \
                    "not exceed the model batch size, otherwise some inputs will " \
                    "be ignored."
                x_parts = np.array_split(x, split_num)
                for idx, x_part in enumerate(x_parts):
                    data_to_be_rdd[idx].append(x_part)
            data_to_be_rdd = [nest.pack_sequence_as(data, shard)
                              for shard in data_to_be_rdd]
            data_rdd = sc.parallelize(data_to_be_rdd, numSlices=split_num)
        result_rdd = self.model.distributed_predict(data_rdd, sc)
        result_arr_list = result_rdd.collect()
        result_arr = np.concatenate(result_arr_list, axis=0)
        return result_arr
    else:
        raise ValueError("Only XShards, a numpy array and a list of numpy arrays "
                         "are supported as input data, but got " +
                         data.__class__.__name__)
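
# Sketch for the predict above (assumptions noted inline): inputs are laid out as
# (num_groups, batch, ...) so that the second dimension respects the model batch
# size; `est` and `model_batch_size` are illustrative names.
import numpy as np

model_batch_size = 4  # assumed to match the batch size the model was created with
data = np.random.rand(16, model_batch_size, 3, 224, 224).astype(np.float32)
result = est.predict(data)  # a single numpy array concatenated over all splits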