Example #1
    def _transform_batch(
            self, func: Callable[..., pd.Series],
            return_type: Optional[Union[SeriesType, ScalarType]]) -> "Series":
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.series import Series, first_series
        from pyspark import pandas as ps

        if not isinstance(func, FunctionType):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        if return_type is None:
            # TODO: In this case, it avoids the shortcut for now (but only infers schema)
            #  because it returns a series from a different DataFrame and it has a different
            #  anchor. We should fix this to allow the shortcut or only allow to infer
            #  schema.
            limit = ps.get_option("compute.shortcut_limit")
            pser = self._psser.head(limit + 1)._to_internal_pandas()
            transformed = pser.transform(func)
            psser = Series(transformed)  # type: Series

            field = psser._internal.data_fields[0].normalize_spark_type()
        else:
            spark_return_type = return_type.spark_type
            dtype = return_type.dtype
            field = InternalField(
                dtype=dtype,
                struct_field=StructField(
                    name=self._psser._internal.data_spark_column_names[0],
                    dataType=spark_return_type,
                ),
            )

        psdf = self._psser.to_frame()
        columns = psdf._internal.spark_column_names

        def pandas_concat(*series: pd.Series) -> pd.DataFrame:
            # From Spark 3.0, the input for a struct type can only be a DataFrame.
            # This workaround concatenates the input Series into a frame. See SPARK-27240.
            pdf = pd.concat(series, axis=1)
            pdf.columns = columns
            return pdf

        def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
            return func(first_series(pdf)).to_frame()

        return_schema = StructType(
            [StructField(SPARK_DEFAULT_SERIES_NAME, field.spark_type)])
        output_func = GroupBy._make_pandas_df_builder_func(psdf,
                                                           apply_func,
                                                           return_schema,
                                                           retain_index=False)

        @pandas_udf(returnType=field.spark_type)  # type: ignore
        def pudf(*series: pd.Series) -> pd.Series:
            return first_series(output_func(pandas_concat(*series)))

        return self._psser._with_new_scol(
            scol=pudf(*psdf._internal.spark_columns).alias(field.name),
            field=field)
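
For context, this `_transform_batch` is the private worker behind the batch-transform accessor on pandas-on-Spark Series: `func` is applied to batches of pandas Series inside a pandas UDF, and the Spark schema is taken either from an explicit type hint or from transforming a small local sample. A minimal usage sketch, assuming the public `Series.pandas_on_spark.transform_batch` accessor of recent PySpark releases (data and functions below are illustrative):

import pyspark.pandas as ps

psser = ps.Series([1, 2, 3])

# With a return-type hint, the `return_type is not None` branch is taken and
# the Spark schema is built directly from the annotation.
def plus_one(pser) -> ps.Series[int]:
    return pser + 1

psser.pandas_on_spark.transform_batch(plus_one)

# Without a hint, the first `compute.shortcut_limit` + 1 rows are transformed
# locally with plain pandas to infer the schema (the `return_type is None` branch).
psser.pandas_on_spark.transform_batch(lambda pser: pser + 1)
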
Example #2
    def _transform_batch(self, func, return_type: Optional[Union[SeriesType,
                                                                 ScalarType]]):
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.series import Series, first_series
        from pyspark import pandas as ps

        if not isinstance(func, types.FunctionType):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        if return_type is None:
            # TODO: In this case, it avoids the shortcut for now (but only infers schema)
            #  because it returns a series from a different DataFrame and it has a different
            #  anchor. We should fix this to allow the shortcut or only allow to infer
            #  schema.
            limit = ps.get_option("compute.shortcut_limit")
            pser = self._kser.head(limit + 1)._to_internal_pandas()
            transformed = pser.transform(func)
            kser = Series(transformed)  # type: Series
            spark_return_type = force_decimal_precision_scale(
                as_nullable_spark_type(kser.spark.data_type))
            dtype = kser.dtype
        else:
            spark_return_type = return_type.spark_type
            dtype = return_type.dtype

        kdf = self._kser.to_frame()
        columns = kdf._internal.spark_column_names

        def pandas_concat(series):
            # From Spark 3.0, the input for a struct type can only be a DataFrame.
            # This workaround concatenates the input Series into a frame. See SPARK-27240.
            pdf = pd.concat(series, axis=1)
            pdf.columns = columns
            return pdf

        def apply_func(pdf):
            return func(first_series(pdf)).to_frame()

        return_schema = StructType(
            [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)])
        output_func = GroupBy._make_pandas_df_builder_func(kdf,
                                                           apply_func,
                                                           return_schema,
                                                           retain_index=False)

        pudf = pandas_udf(
            lambda *series: first_series(output_func(pandas_concat(series))),
            returnType=spark_return_type,
            functionType=PandasUDFType.SCALAR,
        )

        return self._kser._with_new_scol(
            scol=pudf(*kdf._internal.spark_columns).alias(
                self._kser._internal.spark_column_names[0]),
            dtype=dtype,
        )
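
Example #2 is the earlier Koalas variant of the same helper: `_kser` instead of `_psser`, the inferred schema normalized with `as_nullable_spark_type` and `force_decimal_precision_scale` rather than wrapped in an `InternalField`, and an explicit `PandasUDFType.SCALAR` UDF. A hedged usage sketch, assuming the Koalas-era `Series.koalas.transform_batch` accessor (illustrative data):

import databricks.koalas as ks

kser = ks.Series([1, 2, 3])

# No return-type hint: the schema is inferred from a head() sample and then
# normalized, as in the `return_type is None` branch above.
kser.koalas.transform_batch(lambda pser: pser + 1)
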
Example #3
def _auto_patch_pandas() -> None:
    import pandas as pd

    # Module-level globals so they can be referenced in test cases.
    global _frame_has_class_getitem
    global _series_has_class_getitem

    _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__")
    _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__")

    if sys.version_info >= (3, 7):
        # Just in case pandas implements '__class_getitem__' later.
        if not _frame_has_class_getitem:
            pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params)

        if not _series_has_class_getitem:
            pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)
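
`_auto_patch_pandas` appears to run when `pyspark.pandas` is imported; on Python 3.7+ it makes `pd.DataFrame[...]` and `pd.Series[...]` delegate to the pandas-on-Spark `__class_getitem__` whenever pandas itself does not define one. A minimal sketch of the kind of return-type hint this enables (the function and data are illustrative, and `pandas_on_spark.apply_batch` is assumed from recent PySpark releases):

import pandas as pd
import pyspark.pandas as ps  # importing this is assumed to trigger the auto-patching

psdf = ps.DataFrame({"a": [1, 2, 3]})

# The annotation subscripts the plain pandas class; with the patch in place it
# resolves through the pandas-on-Spark __class_getitem__ (or pandas' own, if
# pandas defines one), which the schema inference understands.
def pandas_plus(pdf: pd.DataFrame) -> pd.DataFrame[int]:
    return pdf + 1

psdf.pandas_on_spark.apply_batch(pandas_plus)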