Пример #1
0
    def _transform_batch(self, func, return_type: Optional[Union[SeriesType, ScalarType]]):
        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.series import Series, first_series
        from databricks import koalas as ks

        if not isinstance(func, types.FunctionType):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        if return_type is None:
            # TODO: In this case, it avoids the shortcut for now (but only infers schema)
            #  because it returns a series from a different DataFrame and it has a different
            #  anchor. We should fix this to allow the shortcut or only allow to infer
            #  schema.
            limit = ks.get_option("compute.shortcut_limit")
            pser = self._kser.head(limit + 1)._to_internal_pandas()
            transformed = pser.transform(func)
            kser = Series(transformed)  # type: Series
            spark_return_type = force_decimal_precision_scale(
                as_nullable_spark_type(kser.spark.data_type)
            )
            dtype = kser.dtype
        else:
            spark_return_type = return_type.spark_type
            dtype = return_type.dtype

        kdf = self._kser.to_frame()
        columns = kdf._internal.spark_column_names

        def pandas_concat(series):
            # The input can only be a DataFrame for struct from Spark 3.0.
            # This works around to make the input as a frame. See SPARK-27240
            pdf = pd.concat(series, axis=1)
            pdf.columns = columns
            return pdf

        def apply_func(pdf):
            return func(first_series(pdf)).to_frame()

        return_schema = StructType([StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)])
        output_func = GroupBy._make_pandas_df_builder_func(
            kdf, apply_func, return_schema, retain_index=False
        )

        pudf = pandas_udf(
            lambda *series: first_series(output_func(pandas_concat(series))),
            returnType=spark_return_type,
            functionType=PandasUDFType.SCALAR,
        )

        return self._kser._with_new_scol(
            scol=pudf(*kdf._internal.spark_columns).alias(
                self._kser._internal.spark_column_names[0]
            ),
            dtype=dtype,
        )
Пример #2
0
    def indexer_between_time(
        self,
        start_time: Union[datetime.time, str],
        end_time: Union[datetime.time, str],
        include_start: bool = True,
        include_end: bool = True,
    ) -> Index:
        """
        Return index locations of values between particular times of day
        (e.g., 9:00-9:30AM).

        Parameters
        ----------
        start_time, end_time : datetime.time, str
            Time passed either as object (datetime.time) or as string in
            appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
            "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p").
        include_start : bool, default True
        include_end : bool, default True

        Returns
        -------
        values_between_time : Index of integers

        Examples
        --------
        >>> kidx = ks.date_range("2000-01-01", periods=3, freq="T")
        >>> kidx  # doctest: +NORMALIZE_WHITESPACE
        DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                       '2000-01-01 00:02:00'],
                      dtype='datetime64[ns]', freq=None)

        >>> kidx.indexer_between_time("00:01", "00:02").sort_values()
        Int64Index([1, 2], dtype='int64')

        >>> kidx.indexer_between_time("00:01", "00:02", include_end=False)
        Int64Index([1], dtype='int64')

        >>> kidx.indexer_between_time("00:01", "00:02", include_start=False)
        Int64Index([2], dtype='int64')
        """
        def pandas_between_time(pdf) -> ks.DataFrame[int]:
            return pdf.between_time(start_time, end_time, include_start,
                                    include_end)

        kdf = self.to_frame()[[]]
        id_column_name = verify_temp_column_name(kdf, "__id_column__")
        kdf = kdf.koalas.attach_id_column("distributed-sequence",
                                          id_column_name)
        with ks.option_context("compute.default_index_type", "distributed"):
            # The attached index in the statement below will be dropped soon,
            # so we enforce “distributed” default index type
            kdf = kdf.koalas.apply_batch(pandas_between_time)
        return ks.Index(first_series(kdf).rename(self.name))
Пример #3
0
    def analyzed(self) -> "ks.Series":
        """
        Returns a new Series with the analyzed Spark DataFrame.

        After multiple operations, the underlying Spark plan could grow huge
        and make the Spark planner take a long time to finish the planning.

        This function is for the workaround to avoid it.

        .. note:: After analyzed, operations between the analyzed Series and the original one
            will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.

        Returns
        -------
        Series

        Examples
        --------
        >>> ser = ks.Series([1, 2, 3])
        >>> ser
        0    1
        1    2
        2    3
        dtype: int64

        The analyzed one should return the same value.

        >>> ser.spark.analyzed
        0    1
        1    2
        2    3
        dtype: int64

        However, it won't work with the same anchor Series.

        >>> ser + ser.spark.analyzed
        Traceback (most recent call last):
        ...
        ValueError: ... enable 'compute.ops_on_diff_frames' option.

        >>> with ks.option_context('compute.ops_on_diff_frames', True):
        ...     (ser + ser.spark.analyzed).sort_index()
        0    2
        1    4
        2    6
        dtype: int64
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import first_series

        return first_series(DataFrame(self._data._internal.resolved_copy))
Пример #4
0
def align_diff_series(func, this_series, *args, how="full"):
    from databricks.koalas.base import IndexOpsMixin
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import first_series

    cols = [arg for arg in args if isinstance(arg, IndexOpsMixin)]
    combined = combine_frames(this_series.to_frame(), *cols, how=how)

    scol = func(combined["this"]._internal.data_spark_columns[0],
                *combined["that"]._internal.data_spark_columns)

    internal = combined._internal.copy(
        column_labels=this_series._internal.column_labels,
        data_spark_columns=[scol.alias(name_like_string(this_series.name))],
    )
    return first_series(DataFrame(internal))
Пример #5
0
    def indexer_at_time(self, time: Union[datetime.time, str], asof: bool = False) -> Index:
        """
        Return index locations of values at particular time of day
        (e.g. 9:30AM).

        Parameters
        ----------
        time : datetime.time or str
            Time passed in either as object (datetime.time) or as string in
            appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
            "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").

        Returns
        -------
        values_at_time : Index of integers

        Examples
        --------
        >>> kidx = ks.date_range("2000-01-01", periods=3, freq="T")
        >>> kidx  # doctest: +NORMALIZE_WHITESPACE
        DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                       '2000-01-01 00:02:00'],
                      dtype='datetime64[ns]', freq=None)

        >>> kidx.indexer_at_time("00:00")
        Int64Index([0], dtype='int64')

        >>> kidx.indexer_at_time("00:01")
        Int64Index([1], dtype='int64')
        """
        if asof:
            raise NotImplementedError("'asof' argument is not supported")

        def pandas_at_time(pdf) -> ks.DataFrame[int]:
            return pdf.at_time(time, asof)

        kdf = self.to_frame()[[]]
        id_column_name = verify_temp_column_name(kdf, "__id_column__")
        kdf = kdf.koalas.attach_id_column("distributed-sequence", id_column_name)
        with ks.option_context("compute.default_index_type", "distributed"):
            # The attached index in the statement below will be dropped soon,
            # so we enforce “distributed” default index type
            kdf = kdf.koalas.apply_batch(pandas_at_time)
        return ks.Index(first_series(kdf).rename(self.name))
Пример #6
0
    def _is_monotonic_decreasing(self):
        scol = self.spark.column
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
        prev = F.lag(scol, 1).over(window)

        cond = F.lit(True)
        has_not_null = F.lit(True)
        for field in self.spark.data_type[::-1]:
            left = scol.getField(field.name)
            right = prev.getField(field.name)
            compare = MultiIndex._comparator_for_monotonic_decreasing(
                field.dataType)
            # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
            # Therefore, we should check `has_not_null` over the all levels.
            has_not_null = has_not_null & left.isNotNull()
            cond = F.when(left.eqNullSafe(right), cond).otherwise(
                compare(left, right, spark.Column.__lt__))

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(
                self._internal.index_spark_columns),
            "__is_monotonic_decreasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)])

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_dtypes=self._internal.index_dtypes,
        )

        return first_series(DataFrame(internal))
Пример #7
0
    def predict(self, data):
        """
        Returns a prediction on the data.

        If the data is a koalas DataFrame, the return is a Koalas Series.

        If the data is a pandas Dataframe, the return is the expected output of the underlying
        pyfunc object (typically a pandas Series or a numpy array).
        """
        if isinstance(data, pd.DataFrame):
            return self._model.predict(data)
        if isinstance(data, DataFrame):
            return_col = self._model_udf(*data._internal.data_spark_columns)
            # TODO: the columns should be named according to the mlflow spec
            # However, this is only possible with spark >= 3.0
            # s = F.struct(*data.columns)
            # return_col = self._model_udf(s)
            column_labels = [(col, )
                             for col in data._internal.spark_frame.select(
                                 return_col).columns]
            internal = data._internal.copy(column_labels=column_labels,
                                           data_spark_columns=[return_col])
            return first_series(DataFrame(internal))
Пример #8
0
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        """
        Return a Series containing counts of unique values.
        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : boolean, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : boolean, default True
            Sort by values.
        ascending : boolean, default False
            Sort in ascending order.
        bins : Not Yet Supported
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.

        Examples
        --------
        For Series

        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
        >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        Name: x, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
        1.0    0.6
        0.0    0.4
        Name: x, dtype: float64

        **dropna**
        With `dropna` set to `False` we can also see NaN index values.

        >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
        1.0    3
        0.0    2
        NaN    1
        Name: x, dtype: int64

        For Index

        >>> idx = ks.Index([3, 1, 2, 3, 4, np.nan])
        >>> idx
        Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')

        >>> idx.value_counts().sort_index()
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        dtype: int64

        **sort**

        With `sort` set to `False`, the result wouldn't be sorted by number of count.

        >>> idx.value_counts(sort=True).sort_index()
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        dtype: int64

        **normalize**

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> idx.value_counts(normalize=True).sort_index()
        1.0    0.2
        2.0    0.2
        3.0    0.4
        4.0    0.2
        dtype: float64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> idx.value_counts(dropna=False).sort_index()  # doctest: +SKIP
        1.0    1
        2.0    1
        3.0    2
        4.0    1
        NaN    1
        dtype: int64

        For MultiIndex.

        >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                       ['speed', 'weight', 'length']],
        ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                       [1, 1, 1, 1, 1, 2, 1, 2, 2]])
        >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
        >>> s.index  # doctest: +SKIP
        MultiIndex([(  'lama', 'weight'),
                    (  'lama', 'weight'),
                    (  'lama', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'length'),
                    ('falcon', 'weight'),
                    ('falcon', 'length'),
                    ('falcon', 'length')],
                   )

        >>> s.index.value_counts().sort_index()
        (cow, length)       1
        (cow, weight)       2
        (falcon, length)    2
        (falcon, weight)    1
        (lama, weight)      3
        dtype: int64

        >>> s.index.value_counts(normalize=True).sort_index()
        (cow, length)       0.111111
        (cow, weight)       0.222222
        (falcon, length)    0.222222
        (falcon, weight)    0.111111
        (lama, weight)      0.333333
        dtype: float64

        If Index has name, keep the name up.

        >>> idx = ks.Index([0, 0, 0, 1, 1, 2, 3], name='koalas')
        >>> idx.value_counts().sort_index()
        0    3
        1    2
        2    1
        3    1
        Name: koalas, dtype: int64
        """
        from databricks.koalas.series import first_series

        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._internal.spark_frame.select(
                self.spark.column).dropna()
        else:
            sdf_dropna = self._internal.spark_frame.select(self.spark.column)
        index_name = SPARK_DEFAULT_INDEX_NAME
        column_name = self._internal.data_spark_column_names[0]
        sdf = sdf_dropna.groupby(
            scol_for(sdf_dropna, column_name).alias(index_name)).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col("count"))
            else:
                sdf = sdf.orderBy(F.col("count").desc())

        if normalize:
            sum = sdf_dropna.count()
            sdf = sdf.withColumn("count", F.col("count") / F.lit(sum))

        internal = InternalFrame(
            spark_frame=sdf,
            index_map=OrderedDict({index_name: None}),
            column_labels=self._internal.column_labels,
            data_spark_columns=[scol_for(sdf, "count")],
            column_label_names=self._internal.column_label_names,
        )

        return first_series(DataFrame(internal))
Пример #9
0
    def apply(self, func) -> "ks.Series":
        """
        Applies a function that takes and returns a Spark column. It allows to natively
        apply a Spark function and column APIs with the Spark column internally used
        in Series or Index.

        .. note:: It forces to lose the index and end up with using default index. It is
            preferred to use :meth:`Series.spark.transform` or `:meth:`DataFrame.spark.apply`
            with specifying the `inedx_col`.

        .. note:: It does not require to have the same length of the input and output.
            However, it requires to create a new DataFrame internally which will require
            to set `compute.ops_on_diff_frames` to compute even with the same origin
            DataFrame that is expensive, whereas :meth:`Series.spark.transform` does not
            require it.

        Parameters
        ----------
        func : function
            Function to apply the function against the data by using Spark columns.

        Returns
        -------
        Series

        Raises
        ------
        ValueError : If the output from the function is not a Spark column.

        Examples
        --------
        >>> from databricks import koalas as ks
        >>> from pyspark.sql.functions import count, lit
        >>> df = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
        >>> df
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.a.spark.apply(lambda c: count(c))
        0    3
        Name: a, dtype: int64

        >>> df.a.spark.apply(lambda c: c + df.b.spark.column)
        0    5
        1    7
        2    9
        Name: a, dtype: int64
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series, first_series
        from databricks.koalas.internal import HIDDEN_COLUMNS

        output = func(self._data.spark.column)
        if not isinstance(output, Column):
            raise ValueError("The output of the function [%s] should be of a "
                             "pyspark.sql.Column; however, got [%s]." %
                             (func, type(output)))
        assert isinstance(self._data, Series)

        sdf = self._data._internal.spark_frame.drop(
            *HIDDEN_COLUMNS).select(output)
        # Lose index.
        return first_series(DataFrame(sdf)).rename(self._data.name)
Пример #10
0
    def transform_batch(self, func, *args, **kwargs):
        """
        Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame.
        The pandas DataFrame given to the function is of a batch used internally. The length of
        each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access to the whole input frame. Koalas internally
            splits the input series into multiple batches and calls `func` with each batch multiple
            times. Therefore, operations such as global aggregations are impossible. See the example
            below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ks.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ks.DataFrame({'A': range(1000)})
            >>> df.koalas.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ks.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas friendly style as below:

            >>> def plus_one(x) -> ks.DataFrame['a': float, 'b': float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1


        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.koalas.apply_batch: For row/columnwise operations.
        Series.koalas.transform_batch: transform the search as each pandas chunks.

        Examples
        --------
        >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ks.DataFrame[int, int]:
        ...     return pdf + 1
        >>> df.koalas.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ks.DataFrame['A': int, 'B': int]:
        ...     return pdf + 1
        >>> df.koalas.transform_batch(plus_one_func)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> def plus_one_func(pdf) -> ks.Series[int]:
        ...     return pdf.B + 1
        >>> df.koalas.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int32

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.koalas.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).koalas.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.koalas.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.koalas.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import first_series
        from databricks import koalas as ks

        assert callable(
            func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        names = self._kdf._internal.to_internal_spark_frame.schema.names
        should_by_pass = LooseVersion(pyspark.__version__) >= "3.0"

        def pandas_concat(series):
            # The input can only be a DataFrame for struct from Spark 3.0.
            # This works around to make the input as a frame. See SPARK-27240
            pdf = pd.concat(series, axis=1)
            pdf = pdf.rename(columns=dict(zip(pdf.columns, names)))
            return pdf

        def pandas_extract(pdf, name):
            # This is for output to work around a DataFrame for struct
            # from Spark 3.0.  See SPARK-23836
            return pdf[name]

        def pandas_series_func(f):
            ff = f
            return lambda *series: ff(pandas_concat(series))

        def pandas_frame_func(f):
            ff = f
            return lambda *series: pandas_extract(ff(pandas_concat(series)),
                                                  field.name)

        if should_infer_schema:
            # Here we execute with the first 1000 to get the return type.
            # If the records were less than 1000, it uses pandas API directly for a shortcut.
            limit = ks.get_option("compute.shortcut_limit")
            pdf = self._kdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed))
            if len(transformed) != len(pdf):
                raise ValueError(
                    "transform_batch cannot produce aggregated results")
            kdf_or_kser = ks.from_pandas(transformed)

            if isinstance(kdf_or_kser, ks.Series):
                kser = kdf_or_kser
                pudf = pandas_udf(
                    func if should_by_pass else pandas_series_func(func),
                    returnType=kser.spark.data_type,
                    functionType=PandasUDFType.SCALAR,
                )
                columns = self._kdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._kdf._internal.copy(
                    column_labels=kser._internal.column_labels,
                    data_spark_columns=[
                        (pudf(F.struct(*columns)) if should_by_pass else pudf(
                            *columns)).alias(
                                kser._internal.data_spark_column_names[0])
                    ],
                    column_label_names=kser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                kdf = kdf_or_kser
                if len(pdf) <= limit:
                    # only do the short cut when it returns a frame to avoid
                    # operations on different dataframes in case of series.
                    return kdf

                return_schema = kdf._internal.to_internal_spark_frame.schema
                # Force nullability.
                return_schema = StructType([
                    StructField(field.name, field.dataType)
                    for field in return_schema.fields
                ])

                self_applied = DataFrame(self._kdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True)
                columns = self_applied._internal.spark_columns
                if should_by_pass:
                    pudf = pandas_udf(output_func,
                                      returnType=return_schema,
                                      functionType=PandasUDFType.SCALAR)
                    temp_struct_column = verify_temp_column_name(
                        self_applied._internal.spark_frame, "__temp_struct__")
                    applied = pudf(
                        F.struct(*columns)).alias(temp_struct_column)
                    sdf = self_applied._internal.spark_frame.select(applied)
                    sdf = sdf.selectExpr("%s.*" % temp_struct_column)
                else:
                    applied = []
                    for field in return_schema.fields:
                        applied.append(
                            pandas_udf(
                                pandas_frame_func(output_func),
                                returnType=field.dataType,
                                functionType=PandasUDFType.SCALAR,
                            )(*columns).alias(field.name))
                    sdf = self_applied._internal.spark_frame.select(*applied)
                return DataFrame(kdf._internal.with_new_sdf(sdf))
        else:
            return_type = infer_return_type(original_func)
            return_schema = return_type.tpe
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig)
            if is_return_series:
                pudf = pandas_udf(
                    func if should_by_pass else pandas_series_func(func),
                    returnType=return_schema,
                    functionType=PandasUDFType.SCALAR,
                )
                columns = self._kdf._internal.spark_columns
                internal = self._kdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[
                        (pudf(F.struct(*columns)) if should_by_pass else pudf(
                            *columns)).alias(SPARK_DEFAULT_SERIES_NAME)
                    ],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                self_applied = DataFrame(self._kdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False)
                columns = self_applied._internal.spark_columns

                if should_by_pass:
                    pudf = pandas_udf(output_func,
                                      returnType=return_schema,
                                      functionType=PandasUDFType.SCALAR)
                    temp_struct_column = verify_temp_column_name(
                        self_applied._internal.spark_frame, "__temp_struct__")
                    applied = pudf(
                        F.struct(*columns)).alias(temp_struct_column)
                    sdf = self_applied._internal.spark_frame.select(applied)
                    sdf = sdf.selectExpr("%s.*" % temp_struct_column)
                else:
                    applied = []
                    for field in return_schema.fields:
                        applied.append(
                            pandas_udf(
                                pandas_frame_func(output_func),
                                returnType=field.dataType,
                                functionType=PandasUDFType.SCALAR,
                            )(*columns).alias(field.name))
                    sdf = self_applied._internal.spark_frame.select(*applied)
                return DataFrame(sdf)
Пример #11
0
 def apply_func(pdf):
     return func(first_series(pdf)).to_frame()
Пример #12
0
 def pandas_series_func(f, by_pass):
     ff = f
     if by_pass:
         return lambda *series: first_series(ff(*series))
     else:
         return lambda *series: first_series(ff(pandas_concat(series)))