def test_type_annotation_scalar_iter(self):
    # Every function below takes a single iterator argument and returns an
    # iterator; each one is expected to be inferred as SCALAR_ITER when its
    # signature and resolved type hints are passed to infer_eval_type.

    def series_batches(iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
        pass

    def frame_series_pairs(
            iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:
        pass

    def variadic_frames(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]:
        pass

    def variadic_unions(
            iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]]
    ) -> Iterator[pd.Series]:
        pass

    candidates = (series_batches, frame_series_pairs, variadic_frames, variadic_unions)
    for candidate in candidates:
        inferred = infer_eval_type(signature(candidate), get_type_hints(candidate))
        self.assertEqual(inferred, PandasUDFType.SCALAR_ITER)
def test_type_annotation_scalar_iter(self):
    # Each snippet defines ``func`` with iterator-in / iterator-out type hints.
    # The snippet is executed with ``self.local`` as its namespace, then the
    # freshly defined ``func`` is read back and must be inferred as SCALAR_ITER.
    snippets = (
        ("from typing import Iterator\n"
         "def func(iter: Iterator[pd.Series]) -> Iterator[pd.Series]: pass"),
        ("from typing import Iterator, Tuple\n"
         "def func(iter: Iterator[Tuple[pd.DataFrame, pd.Series]]) -> Iterator[pd.DataFrame]:\n"
         " pass"),
        ("from typing import Iterator, Tuple\n"
         "def func(iter: Iterator[Tuple[pd.DataFrame, ...]]) -> Iterator[pd.Series]: pass"),
        ("from typing import Iterator, Tuple, Union\n"
         "def func(iter: Iterator[Tuple[Union[pd.DataFrame, pd.Series], ...]])"
         " -> Iterator[pd.Series]: pass"),
    )
    for snippet in snippets:
        exec(snippet, self.local)
        defined = self.local['func']
        self.assertEqual(
            infer_eval_type(inspect.signature(defined)), PandasUDFType.SCALAR_ITER)
def _create_pandas_udf(f, returnType, evalType):
    """Resolve the eval type for ``f``, check its arity, and build the UDF.

    On Python 3.6+ the eval type is handled in one of three ways:

    * an explicitly given scalar, scalar-iter or grouped-agg type triggers a
      ``UserWarning`` recommending type hints instead;
    * an explicitly given grouped-map, map-iter or cogrouped-map type is
      left untouched;
    * otherwise, if ``f`` carries any annotations, the type is inferred
      from its signature via ``infer_eval_type``.

    If the eval type is still ``None`` afterwards it defaults to the scalar
    pandas UDF type.

    :param f: the user function to wrap.
    :param returnType: return type forwarded unchanged to ``_create_udf``.
    :param evalType: a ``PythonEvalType`` value, or ``None`` to infer/default.
    :return: whatever ``_create_udf(f, returnType, evalType)`` returns.
    :raises ValueError: if the positional-argument count of ``f`` does not
        match what the resolved eval type requires.
    """
    argspec = _get_argspec(f)

    # pandas UDF by type hints.
    if sys.version_info >= (3, 6):
        from inspect import signature

        hint_preferred_types = (
            PythonEvalType.SQL_SCALAR_PANDAS_UDF,
            PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
            PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
        )
        # In case of 'SQL_GROUPED_MAP_PANDAS_UDF', deprecation warning is being triggered
        # at `apply` instead.
        # In case of 'SQL_MAP_PANDAS_ITER_UDF' and 'SQL_COGROUPED_MAP_PANDAS_UDF', the
        # evaluation type will always be set.
        always_explicit_types = (
            PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
            PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
            PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
        )

        if evalType in hint_preferred_types:
            warnings.warn(
                "In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for "
                "pandas UDF instead of specifying pandas UDF type which will be deprecated "
                "in the future releases. See SPARK-28264 for more details.", UserWarning)
        elif evalType not in always_explicit_types and len(argspec.annotations) > 0:
            # No usable explicit type was given, so fall back to the type hints.
            evalType = infer_eval_type(signature(f))
            assert evalType is not None

    if evalType is None:
        # Set default is scalar UDF.
        evalType = PythonEvalType.SQL_SCALAR_PANDAS_UDF

    positional_count = len(argspec.args)
    accepts_nothing = positional_count == 0 and argspec.varargs is None

    scalar_types = (
        PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
    )
    if evalType in scalar_types and accepts_nothing:
        raise ValueError(
            "Invalid function: 0-arg pandas_udfs are not supported. "
            "Instead, create a 1-arg pandas_udf and ignore the arg in your function."
        )

    if evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        if positional_count not in (1, 2):
            raise ValueError(
                "Invalid function: pandas_udf with function type GROUPED_MAP or "
                "the function in groupby.applyInPandas "
                "must take either one argument (data) or two arguments (key, data).")

    if evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        if positional_count not in (2, 3):
            raise ValueError(
                "Invalid function: the function in cogroup.applyInPandas "
                "must take either two arguments (left, right) "
                "or three arguments (key, left, right).")

    return _create_udf(f, returnType, evalType)
def test_type_annotation_group_agg(self):
    # Each function takes pandas Series/DataFrame arguments and is annotated
    # with a non-pandas return type (str, int, float, Row); each signature is
    # expected to be inferred as GROUPED_AGG.

    def single_series(col: pd.Series) -> str:
        pass

    def frame_then_series(col: pd.DataFrame, col1: pd.Series) -> int:
        pass

    def frame_with_varargs(col: pd.DataFrame, *args: pd.Series) -> Row:
        pass

    def varargs_and_kwargs(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:
        pass

    def keyword_only_frame(col: pd.Series, *, col2: pd.DataFrame) -> float:
        pass

    def union_first_argument(
            col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:
        pass

    candidates = (
        single_series,
        frame_then_series,
        frame_with_varargs,
        varargs_and_kwargs,
        keyword_only_frame,
        union_first_argument,
    )
    for candidate in candidates:
        inferred = infer_eval_type(inspect.signature(candidate))
        self.assertEqual(inferred, PandasUDFType.GROUPED_AGG)
def test_type_annotation_scalar(self):
    # Each function takes pandas Series/DataFrame arguments and is annotated
    # as returning a pandas Series or DataFrame; each signature is expected
    # to be inferred as SCALAR.

    def single_series(col: pd.Series) -> pd.Series:
        pass

    def frame_then_series(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame:
        pass

    def frame_with_varargs(col: pd.DataFrame, *args: pd.Series) -> pd.Series:
        pass

    def varargs_and_kwargs(
            col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:
        pass

    def keyword_only_frame(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:
        pass

    def union_first_argument(
            col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:
        pass

    candidates = (
        single_series,
        frame_then_series,
        frame_with_varargs,
        varargs_and_kwargs,
        keyword_only_frame,
        union_first_argument,
    )
    for candidate in candidates:
        inferred = infer_eval_type(inspect.signature(candidate))
        self.assertEqual(inferred, PandasUDFType.SCALAR)
def test_string_type_annotation(self):
    # Same shapes as the scalar cases, but with the annotations written as
    # string literals. The hints are resolved with get_type_hints and passed
    # to infer_eval_type alongside the signature; each must yield SCALAR.

    def single_series(col: "pd.Series") -> "pd.Series":
        pass

    def frame_then_series(col: "pd.DataFrame", col1: "pd.Series") -> "pd.DataFrame":
        pass

    def frame_with_varargs(col: "pd.DataFrame", *args: "pd.Series") -> "pd.Series":
        pass

    def varargs_and_kwargs(
            col: "pd.Series", *args: "pd.Series", **kwargs: "pd.DataFrame") -> "pd.Series":
        pass

    def keyword_only_frame(col: "pd.Series", *, col2: "pd.DataFrame") -> "pd.DataFrame":
        pass

    # Union object whose members are string annotations.
    def union_of_strings(
            col: Union["pd.Series", "pd.DataFrame"], *, col2: "pd.DataFrame") -> "pd.Series":
        pass

    # The whole Union expression written as a single string annotation.
    def union_as_string(
            col: "Union[pd.Series, pd.DataFrame]", *, col2: "pd.DataFrame") -> "pd.Series":
        pass

    candidates = (
        single_series,
        frame_then_series,
        frame_with_varargs,
        varargs_and_kwargs,
        keyword_only_frame,
        union_of_strings,
        union_as_string,
    )
    for candidate in candidates:
        inferred = infer_eval_type(signature(candidate), get_type_hints(candidate))
        self.assertEqual(inferred, PandasUDFType.SCALAR)
def test_type_annotation_scalar(self):
    # Each snippet defines ``func`` with pandas-typed arguments and a pandas
    # return annotation. The snippet is executed with ``self.local`` as its
    # namespace, then ``func`` is read back and must be inferred as SCALAR.
    snippets = (
        "def func(col: pd.Series) -> pd.Series: pass",
        "def func(col: pd.DataFrame, col1: pd.Series) -> pd.DataFrame: pass",
        "def func(col: pd.DataFrame, *args: pd.Series) -> pd.Series: pass",
        ("def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> pd.Series:\n"
         " pass"),
        ("def func(col: pd.Series, *, col2: pd.DataFrame) -> pd.DataFrame:\n"
         " pass"),
        ("from typing import Union\n"
         "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> pd.Series:\n"
         " pass"),
    )
    for snippet in snippets:
        exec(snippet, self.local)
        defined = self.local['func']
        self.assertEqual(
            infer_eval_type(inspect.signature(defined)), PandasUDFType.SCALAR)
def test_type_annotation_group_agg(self):
    # Each snippet defines ``func`` with pandas-typed arguments and a
    # non-pandas return annotation (str, int, float, Row). The snippet is
    # executed with ``self.local`` as its namespace, then ``func`` is read
    # back and must be inferred as GROUPED_AGG.
    snippets = (
        "def func(col: pd.Series) -> str: pass",
        "def func(col: pd.DataFrame, col1: pd.Series) -> int: pass",
        ("from pyspark.sql import Row\n"
         "def func(col: pd.DataFrame, *args: pd.Series) -> Row: pass"),
        ("def func(col: pd.Series, *args: pd.Series, **kwargs: pd.DataFrame) -> str:\n"
         " pass"),
        ("def func(col: pd.Series, *, col2: pd.DataFrame) -> float:\n"
         " pass"),
        ("from typing import Union\n"
         "def func(col: Union[pd.Series, pd.DataFrame], *, col2: pd.DataFrame) -> float:\n"
         " pass"),
    )
    for snippet in snippets:
        exec(snippet, self.local)
        defined = self.local['func']
        self.assertEqual(
            infer_eval_type(inspect.signature(defined)), PandasUDFType.GROUPED_AGG)