def _transform_batch(
    self, func: Callable[..., pd.Series], return_type: Optional[Union[SeriesType, ScalarType]]
) -> "Series":
    """Apply ``func`` batch-wise over this Series via a scalar pandas UDF.

    ``func`` receives a pandas Series per batch and must return a pandas Series
    of the same length.

    Parameters
    ----------
    func : Callable[..., pd.Series]
        The per-batch transformation. If it is not a plain Python function
        (e.g. a builtin or callable object), it is wrapped in a lambda first,
        since pandas UDF machinery expects a regular function.
    return_type : Optional[Union[SeriesType, ScalarType]]
        The declared return type. When ``None``, the schema is inferred by
        running ``func`` on a small head-sample of the data.

    Returns
    -------
    Series
        A new Series anchored to the same internal frame, whose Spark column
        is the UDF applied to this Series' column.
    """
    # Imported locally to avoid circular imports between accessor/series/groupby modules.
    from pyspark.pandas.groupby import GroupBy
    from pyspark.pandas.series import Series, first_series
    from pyspark import pandas as ps

    if not isinstance(func, FunctionType):
        # Wrap non-function callables so downstream code always sees a FunctionType.
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    if return_type is None:
        # TODO: In this case, it avoids the shortcut for now (but only infers schema)
        # because it returns a series from a different DataFrame and it has a different
        # anchor. We should fix this to allow the shortcut or only allow to infer
        # schema.
        # Infer the output schema by applying func to the first `limit + 1` rows locally.
        limit = ps.get_option("compute.shortcut_limit")
        pser = self._psser.head(limit + 1)._to_internal_pandas()
        transformed = pser.transform(func)
        psser = Series(transformed)  # type: Series
        # Normalize the inferred Spark type (e.g. nullability) for use as the UDF return type.
        field = psser._internal.data_fields[0].normalize_spark_type()
    else:
        # Caller supplied an explicit return type; build the field directly from it.
        spark_return_type = return_type.spark_type
        dtype = return_type.dtype
        field = InternalField(
            dtype=dtype,
            struct_field=StructField(
                name=self._psser._internal.data_spark_column_names[0],
                dataType=spark_return_type,
            ),
        )

    psdf = self._psser.to_frame()
    columns = psdf._internal.spark_column_names

    def pandas_concat(*series: pd.Series) -> pd.DataFrame:
        # The input can only be a DataFrame for struct from Spark 3.0.
        # This works around to make the input as a frame. See SPARK-27240
        pdf = pd.concat(series, axis=1)
        pdf.columns = columns
        return pdf

    def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
        # Apply func to the single-column frame and re-wrap the result as a frame.
        return func(first_series(pdf)).to_frame()

    return_schema = StructType([StructField(SPARK_DEFAULT_SERIES_NAME, field.spark_type)])
    # Reuse GroupBy's frame-builder so index handling/column renaming stays consistent.
    output_func = GroupBy._make_pandas_df_builder_func(
        psdf, apply_func, return_schema, retain_index=False
    )

    @pandas_udf(returnType=field.spark_type)  # type: ignore
    def pudf(*series: pd.Series) -> pd.Series:
        return first_series(output_func(pandas_concat(*series)))

    # Keep the same anchor; only the Spark column (and its field metadata) changes.
    return self._psser._with_new_scol(
        scol=pudf(*psdf._internal.spark_columns).alias(field.name), field=field
    )
def _transform_batch(self, func, return_type: Optional[Union[SeriesType, ScalarType]]):
    """Apply ``func`` batch-wise over this Series via a SCALAR pandas UDF.

    Older variant of the batch transform: the result column's Spark type and
    dtype are tracked separately (no ``InternalField``), and the UDF is built
    with the legacy ``functionType=PandasUDFType.SCALAR`` API.

    Parameters
    ----------
    func : callable
        Per-batch transformation taking and returning a pandas Series. Wrapped
        in a lambda if it is not a plain Python function.
    return_type : Optional[Union[SeriesType, ScalarType]]
        Declared return type; when ``None`` the schema is inferred from a
        head-sample.

    Returns
    -------
    Series
        A new Series with the transformed Spark column on the same anchor.
    """
    # Imported locally to avoid circular imports between accessor/series/groupby modules.
    from pyspark.pandas.groupby import GroupBy
    from pyspark.pandas.series import Series, first_series
    from pyspark import pandas as ps

    if not isinstance(func, types.FunctionType):
        # Wrap non-function callables so downstream code always sees a FunctionType.
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    if return_type is None:
        # TODO: In this case, it avoids the shortcut for now (but only infers schema)
        # because it returns a series from a different DataFrame and it has a different
        # anchor. We should fix this to allow the shortcut or only allow to infer
        # schema.
        # Infer the output schema by applying func to the first `limit + 1` rows locally.
        limit = ps.get_option("compute.shortcut_limit")
        pser = self._kser.head(limit + 1)._to_internal_pandas()
        transformed = pser.transform(func)
        kser = Series(transformed)  # type: Series
        # Make the inferred type nullable and clamp decimal precision/scale so it is
        # a valid pandas-UDF return type.
        spark_return_type = force_decimal_precision_scale(
            as_nullable_spark_type(kser.spark.data_type)
        )
        dtype = kser.dtype
    else:
        spark_return_type = return_type.spark_type
        dtype = return_type.dtype

    kdf = self._kser.to_frame()
    columns = kdf._internal.spark_column_names

    def pandas_concat(series):
        # The input can only be a DataFrame for struct from Spark 3.0.
        # This works around to make the input as a frame. See SPARK-27240
        pdf = pd.concat(series, axis=1)
        pdf.columns = columns
        return pdf

    def apply_func(pdf):
        # Apply func to the single-column frame and re-wrap the result as a frame.
        return func(first_series(pdf)).to_frame()

    return_schema = StructType([StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)])
    # Reuse GroupBy's frame-builder so index handling/column renaming stays consistent.
    output_func = GroupBy._make_pandas_df_builder_func(
        kdf, apply_func, return_schema, retain_index=False
    )

    # Legacy pandas-UDF construction (pre-type-hint API).
    pudf = pandas_udf(
        lambda *series: first_series(output_func(pandas_concat(series))),
        returnType=spark_return_type,
        functionType=PandasUDFType.SCALAR,
    )
    # Keep the same anchor; only the Spark column (and its dtype metadata) changes.
    return self._kser._with_new_scol(
        scol=pudf(*kdf._internal.spark_columns).alias(
            self._kser._internal.spark_column_names[0]
        ),
        dtype=dtype,
    )
def _auto_patch_pandas() -> None: import pandas as pd # In order to use it in test cases. global _frame_has_class_getitem global _series_has_class_getitem _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__") _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__") if sys.version_info >= (3, 7): # Just in case pandas implements '__class_getitem__' later. if not _frame_has_class_getitem: pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params) if not _series_has_class_getitem: pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)