def test_tensor_array_scalar_cast():
    """Check that single-value tensors can be cast to a Python float."""
    full_shape = (3,) + (1,)
    element_count = np.prod(np.array(full_shape))
    source = np.arange(element_count).reshape(full_shape)
    wrapped = TensorArray(source)

    # Every element of the (3, 1) array holds exactly one value.
    for wrapped_elem, source_elem in zip(wrapped, source):
        assert float(wrapped_elem) == float(source_elem)

    # A whole array holding exactly one value must cast the same way.
    source = np.arange(1).reshape((1, 1, 1))
    wrapped = TensorArray(source)
    assert float(wrapped) == float(source)
def test_dict_pandas():
    """Round-trip a single-column dict of ndarrays through pandas."""
    batch = {"x": np.array([1, 2, 3])}
    wanted_frame = pd.DataFrame({"x": TensorArray(batch["x"])})

    # dict -> DataFrame
    got_frame = convert_batch_type_to_pandas(batch)
    assert wanted_frame.equals(got_frame)

    # DataFrame -> NumPy; compared directly against the original column.
    round_tripped = convert_pandas_to_batch_type(got_frame, type=DataType.NUMPY)
    assert np.array_equal(round_tripped, batch["x"])
def test_dict_multi_dim_to_pandas():
    """Round-trip a dict holding one (3, 2, 2) ndarray through pandas."""
    cube = np.arange(12).reshape((3, 2, 2))
    batch = {"x": cube}
    wanted_frame = pd.DataFrame({"x": TensorArray(cube)})

    # dict -> DataFrame
    got_frame = convert_batch_type_to_pandas(batch)
    assert wanted_frame.equals(got_frame)

    # DataFrame -> NumPy; compared directly against the original column.
    round_tripped = convert_pandas_to_batch_type(got_frame, type=DataType.NUMPY)
    assert np.array_equal(round_tripped, batch["x"])
def test_numpy_object_pandas():
    """Round-trip an object-dtype ndarray of unequal-length lists."""
    ragged = np.array([[1, 2, 3], [1]], dtype=object)
    wanted_frame = pd.DataFrame({TENSOR_COLUMN_NAME: TensorArray(ragged)})

    # ndarray -> DataFrame
    got_frame = convert_batch_type_to_pandas(ragged)
    assert wanted_frame.equals(got_frame)

    # DataFrame -> NumPy
    round_tripped = convert_pandas_to_batch_type(got_frame, type=DataType.NUMPY)
    assert np.array_equal(round_tripped, ragged)
def test_numpy_multi_dim_pandas():
    """Round-trip a bare (3, 2, 2) ndarray through pandas."""
    cube = np.arange(12).reshape((3, 2, 2))
    wanted_frame = pd.DataFrame({TENSOR_COLUMN_NAME: TensorArray(cube)})

    # ndarray -> DataFrame
    got_frame = convert_batch_type_to_pandas(cube)
    assert wanted_frame.equals(got_frame)

    # DataFrame -> NumPy
    round_tripped = convert_pandas_to_batch_type(got_frame, type=DataType.NUMPY)
    assert np.array_equal(round_tripped, cube)
def test_arrow_tensor_pandas():
    """Round-trip an Arrow table with one tensor column through pandas."""
    values = np.array([1, 2, 3])
    wanted_frame = pd.DataFrame({"x": TensorArray(values)})
    arrow_table = pa.Table.from_arrays(
        [ArrowTensorArray.from_numpy(values)],
        names=["x"],
    )

    # Arrow -> DataFrame
    got_frame = convert_batch_type_to_pandas(arrow_table)
    assert wanted_frame.equals(got_frame)

    # DataFrame -> Arrow
    round_tripped = convert_pandas_to_batch_type(got_frame, type=DataType.ARROW)
    assert round_tripped.equals(arrow_table)
def test_dict_pandas_multi_column():
    """Round-trip a two-column dict of ndarrays through pandas."""
    arrays_by_name = {"x": np.array([1, 2, 3]), "y": np.array([4, 5, 6])}
    wanted_columns = {
        name: TensorArray(values) for name, values in arrays_by_name.items()
    }
    wanted_frame = pd.DataFrame(wanted_columns)

    # dict -> DataFrame
    got_frame = convert_batch_type_to_pandas(arrays_by_name)
    assert wanted_frame.equals(got_frame)

    # DataFrame -> dict of NumPy arrays; every returned column must match.
    round_tripped = convert_pandas_to_batch_type(got_frame, type=DataType.NUMPY)
    for name, values in round_tripped.items():
        assert np.array_equal(values, arrays_by_name[name])
def _predict_pandas(
    self,
    data: pd.DataFrame,
    dtype: Union[TensorDtype, Dict[str, TensorDtype]],
) -> pd.DataFrame:
    """Run the model on a pandas batch and return predictions as a DataFrame.

    The batch is converted to NumPy with ``convert_pandas_to_batch_type``,
    turned into model tensors with ``self._array_to_tensor``, fed to
    ``self._model_predict``, and the result is wrapped in ``TensorArray``
    column(s).

    Args:
        data: Input batch.
        dtype: Either one dtype applied to every input, or a dict that is
            indexed by column name to get the dtype of each input.

    Returns:
        A DataFrame whose layout depends on the model output:

        * dict output -> one column per key, named after the key;
        * list/tuple output -> columns ``output_00001``, ``output_00002``,
          ... in output order;
        * anything else -> a single ``predictions`` column.
    """
    tensors = convert_pandas_to_batch_type(data, DataType.NUMPY)

    if isinstance(tensors, np.ndarray):
        # Single numpy array: when dtype is a dict, it is looked up under
        # the first column's name.
        column_name = data.columns[0]
        if isinstance(dtype, dict):
            dtype = dtype[column_name]
        model_input = self._array_to_tensor(tensors, dtype)
    else:
        model_input = {
            k: self._array_to_tensor(
                v, dtype=dtype[k] if isinstance(dtype, dict) else dtype
            )
            for k, v in tensors.items()
        }

    output = self._model_predict(model_input)

    # Handle model multi-output. For example if model outputs 2 images.
    if isinstance(output, dict):
        # Iterate over .items(): iterating the dict itself yields only keys,
        # so unpacking into (k, v) would fail or mis-pair.
        return pd.DataFrame(
            {
                k: TensorArray(self._tensor_to_array(v))
                for k, v in output.items()
            }
        )

    if isinstance(output, (list, tuple)):
        tensor_name = "output_"
        # Column suffix is the 1-based position, zero-padded to 5 digits.
        output_dict = {
            tensor_name + str(i).zfill(5): TensorArray(self._tensor_to_array(out))
            for i, out in enumerate(output, start=1)
        }
        return pd.DataFrame(output_dict)

    return pd.DataFrame(
        {"predictions": TensorArray(self._tensor_to_array(output))},
        columns=["predictions"],
    )
def test_tensor_array_dataframe_repr():
    """Compare repr() of a DataFrame with a TensorArray column to a fixed string."""
    full_shape = (3,) + (2, 2)
    element_count = np.prod(np.array(full_shape))
    values = np.arange(element_count).reshape(full_shape)
    frame = pd.DataFrame({"a": TensorArray(values)})

    # NOTE(review): the literal below is reproduced exactly as it appears in
    # this copy of the file. It looks whitespace-collapsed (no line breaks or
    # column padding) - confirm against the upstream file before relying on it.
    expected_repr = """ a 0 [[ 0, 1], [ 2, 3]] 1 [[ 4, 5], [ 6, 7]] 2 [[ 8, 9], [10, 11]]"""
    assert repr(frame) == expected_repr
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Apply the torchvision image pipeline to the ``"image"`` column.

    Each entry of ``df["image"]`` is converted with ``to_numpy()``, passed
    through ToTensor -> Resize(256) -> CenterCrop(224) -> Normalize, and the
    results are stored back into the column as a ``TensorArray``.

    The frame is modified in place and also returned.
    """
    pipeline = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
            ),
        ]
    )
    transformed = [pipeline(raw.to_numpy()) for raw in df["image"]]
    df["image"] = TensorArray(transformed)
    return df
def convert_batch_type_to_pandas(data: DataBatchType) -> pd.DataFrame:
    """Return a pandas DataFrame built from ``data``.

    Args:
        data: A DataFrame (returned unchanged), an ndarray (stored under
            ``TENSOR_COLUMN_NAME``), a dict of ndarrays (one column per key),
            or a pyarrow Table.

    Returns:
        A pandas DataFrame representation of the input data.

    Raises:
        ValueError: If a dict value is not an ndarray, or if ``data`` is none
            of the supported types.
    """
    from ray.air.util.tensor_extensions.pandas import TensorArray

    if isinstance(data, pd.DataFrame):
        return data

    if isinstance(data, np.ndarray):
        return pd.DataFrame({TENSOR_COLUMN_NAME: TensorArray(data)})

    if isinstance(data, dict):
        columns = {}
        for name, array in data.items():
            if not isinstance(array, np.ndarray):
                raise ValueError(
                    "All values in the provided dict must be of type "
                    f"np.ndarray. Found type {type(array)} for key {name} "
                    f"instead."
                )
            # Wrap each ndarray so it can live in a single DataFrame column.
            columns[name] = TensorArray(array)
        return pd.DataFrame(columns)

    # pyarrow may be None here, so guard before touching pyarrow.Table.
    if pyarrow is not None and isinstance(data, pyarrow.Table):
        return data.to_pandas()

    raise ValueError(
        f"Received data of type: {type(data)}, but expected it to be one "
        f"of {DataBatchType}"
    )
def preprocess_image_with_label(df: pd.DataFrame) -> pd.DataFrame:
    """Transform the ``"image"`` column and overwrite ``"label"`` with 1.

    Each entry of ``df["image"]`` is converted with ``to_numpy()``, passed
    through ToTensor -> Resize(256) -> CenterCrop(224) -> Normalize, and the
    results are stored back as a ``TensorArray``.

    The frame is modified in place and also returned.
    """
    pipeline = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
            ),
        ]
    )
    transformed = [pipeline(raw.to_numpy()) for raw in df["image"]]
    df["image"] = TensorArray(transformed)

    # Use a fixed synthetic label value for perf benchmark purposes.
    df["label"] = df["label"].map(lambda _label: 1)
    return df
def test_tensor_array_reductions():
    """Check each supported reducer on a tensor column against NumPy."""
    outer_dim = 3
    full_shape = (outer_dim,) + (2, 2, 2)
    element_count = np.prod(np.array(full_shape))
    values = np.arange(element_count).reshape(full_shape)
    frame = pd.DataFrame(
        {"one": list(range(outer_dim)), "two": TensorArray(values)}
    )

    # NumPy is the ground truth for every reducer.
    for reducer_name, numpy_reducer in TensorArray.SUPPORTED_REDUCERS.items():
        # Pandas uses a ddof default of 1 while NumPy uses 0, so give NumPy
        # ddof=1 to make the std/var calculations equivalent.
        extra_kwargs = {"ddof": 1} if reducer_name in ("std", "var") else {}
        np.testing.assert_equal(
            frame["two"].agg(reducer_name),
            numpy_reducer(values, axis=0, **extra_kwargs),
        )
def test_tensor_array_array_protocol():
    """Check np.asarray(..., dtype=float32) on a TensorArray and on one element."""
    full_shape = (3,) + (2, 2, 2)
    element_count = np.prod(np.array(full_shape))
    values = np.arange(element_count).reshape(full_shape)
    wrapped = TensorArray(values)

    # Whole array.
    whole_as_f32 = np.asarray(wrapped, dtype=np.float32)
    np.testing.assert_array_equal(whole_as_f32, values.astype(np.float32))

    # First element only.
    first_elem = wrapped[0]
    first_as_f32 = np.asarray(first_elem, dtype=np.float32)
    np.testing.assert_array_equal(first_as_f32, values[0].astype(np.float32))
def test_tensor_array_ops():
    """Check arithmetic, comparison and bitwise ops on a tensor column against NumPy."""
    full_shape = (3,) + (2, 2, 2)
    element_count = np.prod(np.array(full_shape))
    values = np.arange(element_count).reshape(full_shape)
    frame = pd.DataFrame({"one": [1, 2, 3], "two": TensorArray(values)})

    def arithmetic(x):
        return 2 * (x + 1) / 3

    def comparison(x):
        return x % 2 == 0

    def logical(x):
        return x & (3 * x) | (5 * x)

    # Apply each op to the raw ndarray (ground truth) and to the column.
    for op in (arithmetic, comparison, logical):
        np.testing.assert_equal(op(values), op(frame["two"]))
def untensorize(torch_tensor):
    """Return ``torch_tensor`` as a ``TensorArray``.

    Calls ``.cpu()``, ``.detach()`` and ``.numpy()`` on the tensor, in that
    order, and wraps the resulting array.
    """
    return TensorArray(torch_tensor.cpu().detach().numpy())