def predict(self, data, batch_size=4, feature_cols=None):
    """
    Predict input data.

    :param data: data to be predicted. It can be an XShards or a Spark DataFrame.
           If it is an XShards, each partition is a dictionary of {'x': feature},
           where feature is a numpy array or a list of numpy arrays.
    :param batch_size: batch size used for inference.
    :param feature_cols: Feature column name(s) of data. Only used when data is
           a Spark DataFrame. Default: None.
    :return: predicted result.
             If the input is an XShards, the result is an XShards where each
             partition is a dictionary of {'prediction': result}, and result is
             a numpy array or a list of numpy arrays.
             If the input is a Spark DataFrame, the result is a Spark DataFrame.
    """
    from zoo.orca.learn.utils import convert_predict_rdd_to_xshard
    if isinstance(data, SparkXShards):
        from zoo.orca.data.utils import xshard_to_sample
        data_rdd = data.rdd.flatMap(xshard_to_sample)
    elif isinstance(data, DataFrame):
        schema = data.schema
        data_rdd = data.rdd.map(
            lambda row: row_to_sample(row, schema, feature_cols, None))
    else:
        raise ValueError(
            "Data should be an XShards or a Spark DataFrame, but got "
            + data.__class__.__name__)
    predicted_rdd = self.model.predict(data_rdd, batch_size=batch_size)
    if isinstance(data, SparkXShards):
        result = convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        result = convert_predict_rdd_to_dataframe(data, predicted_rdd)
    return result
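# A minimal, hedged usage sketch for the predict above on an XShards input.
# `est` stands in for an already-built Estimator exposing this method; it is
# an assumption, not defined in this snippet. XShards.partition is the Orca
# API for turning in-memory numpy data into an XShards.
import numpy as np
from zoo.orca.data import XShards

x = np.random.rand(100, 10).astype(np.float32)
shards = XShards.partition({"x": x})              # each partition: {'x': ndarray}
pred_shards = est.predict(shards, batch_size=32)  # `est` is assumed to exist
first_part = pred_shards.collect()[0]["prediction"]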
def predict(
    self,
    data,
    batch_size=4,
    feature_cols=None,
    hard_code_batch_size=False,
    auto_shard_files=False,
):
    """
    Predict input data.

    :param data: data to be predicted. It can be an XShards or a Spark DataFrame.
           If data is an XShards, each element needs to be {'x': a feature numpy
           array or a tuple of feature numpy arrays}.
    :param batch_size: batch size per thread.
    :param feature_cols: list of feature column names if input data is a Spark
           DataFrame.
    :param hard_code_batch_size: whether to hard code batch size for prediction.
           Default: False.
    :param auto_shard_files: whether to automatically detect if the dataset is
           file-based and apply sharding on files, otherwise sharding on records.
           Default: False.
    :return: predicted result.
             If the input is an XShards, the result is an XShards, and the schema
             for each partition is {'prediction': predicted numpy array or list
             of predicted numpy arrays}.
             If the input is a Spark DataFrame, the result is a DataFrame which
             includes the original columns plus a 'prediction' column. The
             'prediction' column can be FloatType, VectorUDT or Array of
             VectorUDT depending on the model output shape.
    """
    assert self.outputs is not None, \
        "output is None; it should not be None in prediction"
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"
    assert not is_tf_data_dataset(data), \
        "tf.data.Dataset currently cannot be used for estimator prediction"

    dataset = to_dataset(
        data,
        batch_size=-1,
        batch_per_thread=batch_size,
        validation_data=None,
        feature_cols=feature_cols,
        label_cols=None,
        hard_code_batch_size=hard_code_batch_size,
        sequential_order=True,
        shuffle=False,
        auto_shard_files=auto_shard_files,
    )

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs,
                               outputs=flat_outputs)
    predicted_rdd = tfnet.predict(dataset)
    if isinstance(data, DataFrame):
        return convert_predict_rdd_to_dataframe(data, predicted_rdd)
    elif isinstance(data, SparkXShards):
        return convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        return predicted_rdd
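# A hedged usage sketch for the predict above on a Spark DataFrame. `est`
# and `df` (a DataFrame with numeric feature columns "f1" and "f2") are
# assumed placeholders, not defined in this snippet.
pred_df = est.predict(df, batch_size=64, feature_cols=["f1", "f2"])
pred_df.select("f1", "f2", "prediction").show(5)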
def predict(
    self,
    data,
    batch_size=4,
    feature_cols=None,
    auto_shard_files=False,
):
    """
    Predict input data.

    :param data: data to be predicted. It can be an XShards or a Spark DataFrame.
           If data is an XShards, each partition is a dictionary of {'x': feature},
           where feature is a numpy array or a tuple of numpy arrays.
    :param batch_size: batch size per thread.
    :param feature_cols: list of feature column names if input data is a Spark
           DataFrame.
    :param auto_shard_files: whether to automatically detect if the dataset is
           file-based and apply sharding on files, otherwise sharding on records.
           Default: False.
    :return: predicted result.
             If the input is an XShards, the result is an XShards, and the schema
             for each partition is {'prediction': predicted numpy array or list
             of predicted numpy arrays}.
             If the input is a Spark DataFrame, the result is a DataFrame which
             includes the original columns plus a 'prediction' column. The
             'prediction' column can be FloatType, VectorUDT or Array of
             VectorUDT depending on the model output shape.
    """
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"
    assert not is_tf_data_dataset(data), \
        "tf.data.Dataset currently cannot be used for estimator prediction"

    dataset = to_dataset(
        data,
        batch_size=-1,
        batch_per_thread=batch_size,
        validation_data=None,
        feature_cols=feature_cols,
        label_cols=None,
        hard_code_batch_size=False,
        sequential_order=True,
        shuffle=False,
        auto_shard_files=auto_shard_files,
    )
    predicted_rdd = self.model.predict(dataset, batch_size)
    if isinstance(data, DataFrame):
        return convert_predict_rdd_to_dataframe(data, predicted_rdd)
    elif isinstance(data, SparkXShards):
        return convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        return predicted_rdd
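# A hedged sketch of collecting per-partition predictions into one numpy
# array after calling the predict above on an XShards input. `est` and
# `shards` are assumed placeholders for an estimator and its XShards input.
import numpy as np

pred_shards = est.predict(shards, batch_size=16)
parts = pred_shards.collect()            # list of {'prediction': ndarray or list}
all_preds = np.concatenate([p["prediction"] for p in parts])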
def test_convert_predict_rdd_to_xshard(self):
    # Build an XShards where each element is {'x': stacked numpy arrays}.
    rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    shards = rdd.mapPartitions(lambda iter: chunks(iter, 5)).map(
        lambda x: {"x": np.stack(x)})
    shards = SparkXShards(shards)
    # Use the same values as mock predictions, so 'prediction' should equal 'x'.
    pred_rdd = self.sc.range(0, 110).map(lambda x: np.array([x] * 50))
    result_shards = convert_predict_rdd_to_xshard(shards, pred_rdd)
    result = np.concatenate(
        [shard["prediction"] for shard in result_shards.collect()])
    expected_result = np.concatenate(
        [shard["x"] for shard in result_shards.collect()])
    assert np.array_equal(result, expected_result)
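# The test above relies on a `chunks` helper that groups an iterator into
# fixed-size lists. A minimal sketch of such a helper (the project's actual
# implementation may differ):
def chunks(iterable, n):
    """Yield successive lists of at most n items from iterable."""
    buf = []
    for item in iterable:
        buf.append(item)
        if len(buf) == n:
            yield buf
            buf = []
    if buf:
        yield buf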
def predict(self, data, batch_size=4, feature_cols="features", sample_preprocessing=None): if isinstance(data, DataFrame): if isinstance(feature_cols, list): data, _, feature_cols = \ BigDLEstimator._combine_cols(data, feature_cols, col_name="features") self.nn_model.setBatchSize(batch_size).setFeaturesCol(feature_cols) if sample_preprocessing is not None: self.nn_model.setSamplePreprocessing(sample_preprocessing) return self.nn_model.transform(data) elif isinstance(data, SparkXShards): from zoo.orca.data.utils import xshard_to_sample from zoo.orca.learn.utils import convert_predict_rdd_to_xshard sample_rdd = data.rdd.flatMap(xshard_to_sample) result_rdd = self.model.predict(sample_rdd) return convert_predict_rdd_to_xshard(data, result_rdd) else: raise ValueError("Data should be XShards or Spark DataFrame, but get " + data.__class__.__name__)
def predict(self, data, batch_size=4, feature_cols="features", sample_preprocessing=None): """ Predict input data :param data: predict input data. It can be XShards or Spark DataFrame. If data is XShards, each partition is a dictionary of {'x': feature}, where feature is a numpy array or a list of numpy arrays. :param batch_size: Batch size used for inference. Default: 4. :param feature_cols: Feature column name(s) of data. Only used when data is a Spark DataFrame. Default: "features". :param sample_preprocessing: Used when data is a Spark DataFrame. If the user want change the default feature_preprocessing specified in Estimator.from_bigdl, the user can pass the new sample_preprocessing methods. :return: predicted result. If input data is Spark DataFrame, the predict result is a DataFrame which includes original columns plus 'prediction' column. The 'prediction' column can be FloatType, VectorUDT or Array of VectorUDT depending on model outputs shape. If input data is an XShards, the predict result is a XShards, each partition of the XShards is a dictionary of {'prediction': result}, where result is a numpy array or a list of numpy arrays. """ if isinstance(data, DataFrame): if isinstance(feature_cols, list): data, _, feature_cols = \ BigDLEstimator._combine_cols(data, feature_cols, col_name="features") self.nn_model.setBatchSize(batch_size).setFeaturesCol(feature_cols) if sample_preprocessing is not None: self.nn_model.setSamplePreprocessing(sample_preprocessing) return self.nn_model.transform(data) elif isinstance(data, SparkXShards): from zoo.orca.data.utils import xshard_to_sample from zoo.orca.learn.utils import convert_predict_rdd_to_xshard sample_rdd = data.rdd.flatMap(xshard_to_sample) result_rdd = self.model.predict(sample_rdd) return convert_predict_rdd_to_xshard(data, result_rdd) else: raise ValueError( "Data should be XShards or Spark DataFrame, but get " + data.__class__.__name__)
def predict(
    self,
    data,
    batch_size=4,
    feature_cols=None,
    auto_shard_files=False,
):
    """
    Predict input data.

    :param data: data to be predicted. It can be an XShards or a Spark DataFrame.
           If data is an XShards, each partition can be a pandas DataFrame or a
           dictionary of {'x': feature}, where feature is a numpy array or a
           tuple of numpy arrays.
    :param batch_size: batch size per thread.
    :param feature_cols: list of feature column names if input data is a Spark
           DataFrame or an XShards of pandas DataFrame.
    :param auto_shard_files: whether to automatically detect if the dataset is
           file-based and apply sharding on files, otherwise sharding on records.
           Default: False.
    :return: predicted result.
             If the input is an XShards, the result is an XShards where each
             partition is a dictionary of {'prediction': result}, and result is
             a numpy array or a list of numpy arrays.
             If the input is a Spark DataFrame, the result is a DataFrame which
             includes the original columns plus a 'prediction' column. The
             'prediction' column can be FloatType, VectorUDT or Array of
             VectorUDT depending on the model output shape.
    """
    assert self.outputs is not None, \
        "output is None; it should not be None in prediction"
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"
    if isinstance(data, SparkXShards):
        if data._get_class_name() == 'pandas.core.frame.DataFrame':
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"
            data = process_xshards_of_pandas_dataframe(data, feature_cols)
    assert not is_tf_data_dataset(data), \
        "tf.data.Dataset currently cannot be used for estimator prediction"

    dataset = to_dataset(
        data,
        batch_size=-1,
        batch_per_thread=batch_size,
        validation_data=None,
        feature_cols=feature_cols,
        label_cols=None,
        hard_code_batch_size=False,
        sequential_order=True,
        shuffle=False,
        auto_shard_files=auto_shard_files,
    )

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs,
                               outputs=flat_outputs)
    predicted_rdd = tfnet.predict(dataset)
    if isinstance(data, DataFrame):
        return convert_predict_rdd_to_dataframe(data, predicted_rdd)
    elif isinstance(data, SparkXShards):
        return convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        return predicted_rdd
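# A hedged sketch for the pandas-DataFrame path unique to the predict above:
# an XShards of pandas DataFrames plus feature_cols selecting the feature
# columns. The CSV path, column names, and `est` are hypothetical placeholders;
# zoo.orca.data.pandas.read_csv is the Orca API for loading CSVs into XShards.
import zoo.orca.data.pandas as orca_pd

shards = orca_pd.read_csv("path/to/data/*.csv")   # XShards of pandas DataFrame
pred_shards = est.predict(shards, batch_size=16, feature_cols=["f1", "f2"])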