def test_batch_prediction_from_pandas_udf():
    """Exercise ``BatchPredictor.from_pandas_udf`` with a simple pandas UDF.

    The UDF emits one ``"bool"`` value per input row. The test expects a
    row-wise ``a == b`` comparison by default, and an all-``True`` column
    when ``all_true=True`` is supplied as an extra keyword to ``predict``.
    """

    def check_truth(df, all_true=False):
        # Either a constant-True column or the element-wise a == b comparison.
        flags = [True] * len(df) if all_true else df["a"] == df["b"]
        return pd.DataFrame({"bool": flags})

    def predicted_flags(dataset, **predict_kwargs):
        # Run the prediction and pull the "bool" field out of each taken row.
        result = predictor.predict(dataset, **predict_kwargs)
        return [row["bool"] for row in result.take()]

    predictor = BatchPredictor.from_pandas_udf(check_truth)

    frame = pd.DataFrame({"a": [1, 2, 3], "b": [1, 5, 6]})
    dataset = ray.data.from_pandas(frame)

    # Only the first row has a == b.
    assert predicted_flags(dataset) == [True, False, False]

    # With the flag set, every row is expected to come back True.
    assert predicted_flags(dataset, all_true=True) == [True, True, True]
# {'predictions': array([-1.2789773], dtype=float32), 'label': 0}
# {'predictions': array([-2.5579545], dtype=float32), 'label': 1}
# {'predictions': array([-3.8369317], dtype=float32), 'label': 0}

# Apply `calculate_accuracy` to every batch, then average the "correct" column.
correct = predictions.map_batches(calculate_accuracy)
_final_accuracy = correct.mean(on="correct")
print("Final accuracy: ", _final_accuracy)
# Final accuracy: 0.5
# __compute_accuracy_end__

# __pipelined_prediction_start__
import pandas as pd
import ray
from ray.air import Checkpoint
from ray.train.predictor import Predictor
from ray.train.batch_predictor import BatchPredictor


def _always_42(data):
    """Return a one-column frame holding `42` once per input row."""
    return pd.DataFrame({"a": [42] * len(data)})


# Create a BatchPredictor that always returns `42` for each input.
batch_pred = BatchPredictor.from_pandas_udf(_always_42)

# Create a dummy dataset.
ds = ray.data.range_tensor(200, parallelism=4)

# Setup a prediction pipeline.
pipeline = batch_pred.predict_pipelined(ds, blocks_per_window=1)

# Print each batch of results as the pipeline yields it.
for batch in pipeline.iter_batches():
    print("Pipeline result", batch)
    # 0 42
    # 1 42
    # ...
# __pipelined_prediction_end__