def predict(self, data, batch_size=32):
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"

    dataset = _to_dataset(data, batch_size=-1, batch_per_thread=batch_size)

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs,
                               outputs=flat_outputs)
    return tfnet.predict(dataset)
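# Minimal hedged usage sketch for the predict method above. `est` stands in for an
# instance of the enclosing class with `inputs`, `outputs` and `sess` already populated,
# and the assumption that the `_to_dataset` helper accepts a numpy feature array is
# mine, not the source's; only the predict call and the RDD-style result mirror the
# code above (TFNet.predict on a TFDataset returns an RDD, as the test below shows).
import numpy as np

features = np.random.rand(8, 2).astype(np.float32)     # illustrative data
predictions_rdd = est.predict(features, batch_size=4)  # hypothetical call
print(predictions_rdd.collect()[:2])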
def test_tfdataset_with_string_rdd(self):
    string_rdd = self.sc.parallelize(["123", "456"], 1)
    ds = TFDataset.from_string_rdd(string_rdd, batch_per_thread=1)
    input_tensor = tf.placeholder(dtype=tf.string, shape=(None,))
    output_tensor = tf.string_to_number(input_tensor)
    with tf.Session() as sess:
        tfnet = TFNet.from_session(sess, inputs=[input_tensor],
                                   outputs=[output_tensor])
    result = tfnet.predict(ds).collect()
    assert result[0] == 123
    assert result[1] == 456
def predict(self, data, batch_size=4, feature_cols=None, hard_code_batch_size=False):
    """
    Predict input data

    :param data: data to be predicted. It can be XShards, Spark DataFrame or
           tf.data.Dataset. If data is XShards, each element needs to be
           {'x': a feature numpy array or a tuple of feature numpy arrays}.
           If data is tf.data.Dataset, each element is a tuple of input tensors.
    :param batch_size: batch size per thread
    :param feature_cols: list of feature column names if input data is a Spark DataFrame.
    :param hard_code_batch_size: whether to hard code the batch size for prediction.
           The default value is False.
    :return: predicted result.
             If input data is XShards or tf.data.Dataset, the predict result is an
             XShards, and the schema of each result is:
             {'prediction': predicted numpy array or list of predicted numpy arrays}.
             If input data is a Spark DataFrame, the predict result is a DataFrame which
             includes the original columns plus a 'prediction' column. The 'prediction'
             column can be FloatType, VectorUDT or Array of VectorUDT depending on the
             model output shape.
    """
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None,
                         feature_cols=feature_cols, labels_cols=None,
                         hard_code_batch_size=hard_code_batch_size,
                         sequential_order=True, shuffle=False)

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs,
                               outputs=flat_outputs)
    predicted_rdd = tfnet.predict(dataset)
    if isinstance(data, DataFrame):
        return convert_predict_to_dataframe(data, predicted_rdd)
    elif isinstance(data, SparkXShards) or isinstance(data, tf.data.Dataset):
        return convert_predict_to_xshard(predicted_rdd)
    else:
        return predicted_rdd
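# Hedged usage sketch for the predict variant above, exercising the Spark DataFrame
# path described in its docstring. `est` is assumed to be an instance of the enclosing
# estimator with a trained graph; the DataFrame contents and column name are
# illustrative assumptions.
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0]),),
     (Vectors.dense([3.0, 4.0]),)],
    ["features"])
# feature_cols is mandatory for DataFrame input (see the assert above); the result
# is the input DataFrame with an extra 'prediction' column appended.
result_df = est.predict(df, batch_size=4, feature_cols=["features"])
result_df.show()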
def predict(self, data, batch_size=32):
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"
    if isinstance(data, SparkXShards):
        dataset = _xshards_to_tf_dataset(data, batch_per_thread=batch_size)
    elif isinstance(data, Dataset):
        dataset = TFDataDataset2(data, batch_size=-1,
                                 batch_per_thread=batch_size)
    else:
        raise ValueError("data must be a SparkXShards or an orca.data.tf.Dataset")

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs,
                               outputs=flat_outputs)
    return tfnet.predict(dataset)
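# Hedged sketch of the SparkXShards branch this predict variant accepts. `est` and the
# XShards construction below are assumptions for illustration; XShards.partition is the
# public Analytics Zoo entry point for turning in-memory numpy data into a SparkXShards,
# though the exact import path may differ by version.
import numpy as np
from zoo.orca.data import XShards

shards = XShards.partition({"x": np.random.rand(8, 2).astype(np.float32)})
pred_shards = est.predict(shards, batch_size=4)  # SparkXShards branch
print(pred_shards.collect()[0])
# Anything that is neither a SparkXShards nor an orca.data.tf.Dataset raises ValueError:
# est.predict([1, 2, 3])  # -> ValueError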
def predict(self, data, batch_size=4, feature_cols=None, auto_shard_files=False):
    """
    Predict input data

    :param data: data to be predicted. It can be XShards or Spark DataFrame.
           If data is XShards, each partition can be a Pandas DataFrame or a dictionary
           of {'x': feature}, where feature is a numpy array or a tuple of numpy arrays.
    :param batch_size: batch size per thread
    :param feature_cols: list of feature column names if input data is a Spark DataFrame
           or an XShards of Pandas DataFrame.
    :param auto_shard_files: whether to automatically detect if the dataset is file-based
           and apply sharding on files; otherwise, sharding is done on records.
           Default is False.
    :return: predicted result.
             If input data is XShards, the predict result is an XShards, and each
             partition of the XShards is a dictionary of {'prediction': result}, where
             the result is a numpy array or a list of numpy arrays.
             If input data is a Spark DataFrame, the predict result is a DataFrame which
             includes the original columns plus a 'prediction' column. The 'prediction'
             column can be FloatType, VectorUDT or Array of VectorUDT depending on the
             model output shape.
    """
    assert self.outputs is not None, \
        "output is None, it should not be None in prediction"
    if isinstance(data, DataFrame):
        assert feature_cols is not None, \
            "feature columns is None; it should not be None in prediction"
    if isinstance(data, SparkXShards):
        if data._get_class_name() == 'pandas.core.frame.DataFrame':
            assert feature_cols is not None, \
                "feature columns is None; it should not be None in prediction"
            data = process_xshards_of_pandas_dataframe(data, feature_cols)
    assert not is_tf_data_dataset(data), "tf.data.Dataset currently cannot be used " \
                                         "for estimator prediction"

    dataset = to_dataset(data, batch_size=-1, batch_per_thread=batch_size,
                         validation_data=None,
                         feature_cols=feature_cols, label_cols=None,
                         hard_code_batch_size=False,
                         sequential_order=True, shuffle=False,
                         auto_shard_files=auto_shard_files)

    flat_inputs = nest.flatten(self.inputs)
    flat_outputs = nest.flatten(self.outputs)
    tfnet = TFNet.from_session(sess=self.sess, inputs=flat_inputs,
                               outputs=flat_outputs)
    predicted_rdd = tfnet.predict(dataset)

    if isinstance(data, DataFrame):
        return convert_predict_rdd_to_dataframe(data, predicted_rdd)
    elif isinstance(data, SparkXShards):
        return convert_predict_rdd_to_xshard(data, predicted_rdd)
    else:
        return predicted_rdd
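# Hedged sketch of the XShards-of-Pandas path handled above. The read_csv helper, its
# signature, and the file path are illustrative assumptions: in Analytics Zoo,
# zoo.orca.data.pandas.read_csv returns an XShards whose partitions are Pandas
# DataFrames, which is exactly the case where feature_cols becomes mandatory above.
import zoo.orca.data.pandas as orca_pd

shards = orca_pd.read_csv("/path/to/data/*.csv")  # hypothetical path
pred_shards = est.predict(shards, batch_size=4, feature_cols=["f1", "f2"])
# Each partition of the result is {'prediction': numpy array or list of numpy arrays}.
print(pred_shards.collect()[0]["prediction"][:2])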
if __name__ == '__main__':
    sparkConf = init_spark_conf().setAppName("testNNClassifer").setMaster('local[1]')
    sc = init_nncontext(sparkConf)
    spark = SparkSession \
        .builder \
        .getOrCreate()

    with tf.Graph().as_default():
        input1 = tf.placeholder(dtype=tf.float32, shape=(None, 2))
        hidden = tf.layers.dense(input1, 4)
        output = tf.sigmoid(tf.layers.dense(hidden, 1))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            net = TFNet.from_session(sess, [input1], [output],
                                     generate_backward=True)

    df = spark.createDataFrame(
        [(Vectors.dense([2.0, 1.0]), 1.0),
         (Vectors.dense([1.0, 2.0]), 0.0),
         (Vectors.dense([2.0, 1.0]), 1.0),
         (Vectors.dense([1.0, 2.0]), 0.0)],
        ["features", "label"])

    print("before training:")
    NNModel(net).transform(df).show()

    classifier = NNClassifier(net, MSECriterion()) \
        .setBatchSize(4) \
        .setOptimMethod(Adam()) \
        .setLearningRate(0.1) \