Example #1
    def _set_ordered(self, *, ordered: bool,
                     inplace: bool) -> Optional["ps.Series"]:
        from pyspark.pandas.frame import DataFrame

        if self.ordered == ordered:
            if inplace:
                return None
            else:
                psser = self._data
        else:
            internal = self._data._psdf._internal.with_new_spark_column(
                self._data._column_label,
                self._data.spark.column,
                field=self._data._internal.data_fields[0].copy(
                    dtype=CategoricalDtype(categories=self.categories,
                                           ordered=ordered)),
            )
            if inplace:
                self._data._psdf._update_internal_frame(internal)
                return None
            else:
                psser = DataFrame(internal)._psser_for(
                    self._data._column_label)

        return psser._with_new_scol(psser.spark.column,
                                    field=psser._internal.data_fields[0])
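
A hedged usage sketch: `_set_ordered` is an internal helper, normally reached through the public categorical accessor. Assuming a running SparkSession, something like the following drives the method via `.cat.as_ordered()`:

import pyspark.pandas as ps
from pandas.api.types import CategoricalDtype

# as_ordered()/as_unordered() on the .cat accessor route through _set_ordered.
psser = ps.Series(["a", "b", "c"]).astype(CategoricalDtype(["a", "b", "c"]))
ordered = psser.cat.as_ordered()   # a new Series with an ordered CategoricalDtype
print(ordered.dtype.ordered)       # True
print(psser.cat.ordered)           # the original Series stays unordered: False
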
Example #2
    def execute(self) -> DataFrame:
        """
        Returns a DataFrame for which the SQL statement has been executed by
        the underlying SQL engine.

        >>> str0 = 'abc'
        >>> ps.sql("select {str0}")
           abc
        0  abc

        >>> str1 = 'abc"abc'
        >>> str2 = "abc'abc"
        >>> ps.sql("select {str0}, {str1}, {str2}")
           abc  abc"abc  abc'abc
        0  abc  abc"abc  abc'abc

        >>> strs = ['a', 'b']
        >>> ps.sql("select 'a' in {strs} as cond1, 'c' in {strs} as cond2")
           cond1  cond2
        0   True  False
        """
        blocks = _string.formatter_parser(self._statement)
        # TODO: use a string builder
        res = ""
        try:
            for (pre, inner, _, _) in blocks:
                var_next = "" if inner is None else self._convert(inner)
                res = res + pre + var_next
            self._normalized_statement = res

            sdf = self._session.sql(self._normalized_statement)
        finally:
            for v in self._temp_views:
                self._session.catalog.dropTempView(v)
        return DataFrame(sdf)
Example #3
    def _is_monotonic_decreasing(self) -> Series:
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)

        cond = SF.lit(True)
        has_not_null = SF.lit(True)
        for scol in self._internal.index_spark_columns[::-1]:
            data_type = self._internal.spark_type_for(scol)
            prev = F.lag(scol, 1).over(window)
            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
            # Since pandas 1.1.4, null values are not allowed at any level of a MultiIndex.
            # Therefore, we should check `has_not_null` across all levels.
            has_not_null = has_not_null & scol.isNotNull()
            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(compare(scol, prev, Column.__lt__))

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(self._internal.index_spark_columns),
            "__is_monotonic_decreasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)]
        )

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col) for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_fields=self._internal.index_fields,
        )

        return first_series(DataFrame(internal))
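
The private check above backs the public `is_monotonic_decreasing` property; a minimal sketch of how it is typically exercised (assuming a running SparkSession):

import pyspark.pandas as ps

midx = ps.MultiIndex.from_tuples([("z", 3), ("y", 2), ("x", 1)])
print(midx.is_monotonic_decreasing)   # expected: True
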
Example #4
    def predict(
            self, data: Union[DataFrame,
                              pd.DataFrame]) -> Union[Series, pd.Series]:
        """
        Returns a prediction on the data.

        If the data is a pandas-on-Spark DataFrame, the return is a pandas-on-Spark Series.

        If the data is a pandas DataFrame, the return is the expected output of the underlying
        pyfunc object (typically a pandas Series or a numpy array).
        """
        if isinstance(data, pd.DataFrame):
            return self._model.predict(data)
        elif isinstance(data, DataFrame):
            return_col = self._model_udf(*data._internal.data_spark_columns)
            # TODO: the columns should be named according to the mlflow spec
            # However, this is only possible with spark >= 3.0
            # s = F.struct(*data.columns)
            # return_col = self._model_udf(s)
            column_labels: List[Label] = [
                (col, ) for col in data._internal.spark_frame.select(
                    return_col).columns
            ]
            internal = data._internal.copy(column_labels=column_labels,
                                           data_spark_columns=[return_col],
                                           data_fields=None)
            return first_series(DataFrame(internal))
        else:
            raise ValueError("unknown data type: {}".format(
                type(data).__name__))
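
A hedged usage sketch: the wrapper is normally obtained via pyspark.pandas.mlflow.load_model (mlflow must be installed); the model URI and feature columns below are placeholders, not taken from the source.

import pyspark.pandas as ps
from pyspark.pandas.mlflow import load_model

model = load_model("runs:/<run_id>/model")   # placeholder URI of an already-logged model
psdf = ps.DataFrame({"x1": [1.0, 2.0], "x2": [3.0, 4.0]})
pred = model.predict(psdf)   # pandas-on-Spark input takes the DataFrame branch above
print(pred.head())
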
Example #5
    def intersection(self, other: Union[DataFrame, Series, Index, List]) -> "MultiIndex":
        """
        Form the intersection of two Index objects.

        This returns a new Index with elements common to the index and `other`.

        Parameters
        ----------
        other : Index or array-like

        Returns
        -------
        intersection : MultiIndex

        Examples
        --------
        >>> midx1 = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> midx2 = ps.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
        >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        if isinstance(other, Series) or not is_list_like(other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        elif isinstance(other, DataFrame):
            raise ValueError("Index data must be 1-dimensional")
        elif isinstance(other, MultiIndex):
            spark_frame_other = other.to_frame().to_spark()
            keep_name = self.names == other.names
        elif isinstance(other, Index):
            # Always returns an empty MultiIndex if `other` is Index.
            return cast(MultiIndex, self.to_frame().head(0).index)
        elif not all(isinstance(item, tuple) for item in other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        else:
            other = MultiIndex.from_tuples(list(other))
            spark_frame_other = cast(MultiIndex, other).to_frame().to_spark()
            keep_name = True

        index_fields = self._index_fields_for_union_like(other, func_name="intersection")

        default_name: List[Name] = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
        spark_frame_self = self.to_frame(name=default_name).to_spark()
        spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
        if keep_name:
            index_names = self._internal.index_names
        else:
            index_names = None

        internal = InternalFrame(
            spark_frame=spark_frame_intersected,
            index_spark_columns=[
                scol_for(spark_frame_intersected, cast(str, col)) for col in default_name
            ],
            index_names=index_names,
            index_fields=index_fields,
        )
        return cast(MultiIndex, DataFrame(internal).index)
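
The doctest only shows the MultiIndex-vs-MultiIndex case; a small sketch of the list-of-tuples branch handled above (assuming a running SparkSession):

import pyspark.pandas as ps

midx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
# A plain list of tuples is converted through MultiIndex.from_tuples above.
common = midx.intersection([("b", "y"), ("d", "w")])
print(common.sort_values())   # expected: MultiIndex([('b', 'y')], )
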
Example #6
    def analyzed(self) -> "ps.Series":
        """
        Returns a new Series with the analyzed Spark DataFrame.

        After multiple operations, the underlying Spark plan could grow huge
        and make the Spark planner take a long time to finish the planning.

        This function is a workaround to avoid it.

        .. note:: After analyzed, operations between the analyzed Series and the original one
            will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.

        Returns
        -------
        Series

        Examples
        --------
        >>> ser = ps.Series([1, 2, 3])
        >>> ser
        0    1
        1    2
        2    3
        dtype: int64

        The analyzed one should return the same value.

        >>> ser.spark.analyzed
        0    1
        1    2
        2    3
        dtype: int64

        However, it won't work with the same anchor Series.

        >>> ser + ser.spark.analyzed
        Traceback (most recent call last):
        ...
        ValueError: ... enable 'compute.ops_on_diff_frames' option.

        >>> with ps.option_context('compute.ops_on_diff_frames', True):
        ...     (ser + ser.spark.analyzed).sort_index()
        0    2
        1    4
        2    6
        dtype: int64
        """
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series

        return first_series(DataFrame(self._data._internal.resolved_copy))
Example #7
    def coalesce(self, num_partitions: int) -> "ps.DataFrame":
        """
        Returns a new DataFrame that has exactly `num_partitions` partitions.

        .. note:: This operation results in a narrow dependency, e.g. if you go from 1000
            partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new
            partitions will claim 10 of the current partitions. If a larger number of partitions is
            requested, it will stay at the current number of partitions. However, if you're doing a
            drastic coalesce, e.g. to num_partitions = 1, this may result in your computation taking
            place on fewer nodes than you like (e.g. one node in the case of num_partitions = 1). To
            avoid this, you can call repartition(). This will add a shuffle step, but means the
            current upstream partitions will be executed in parallel (per whatever the current
            partitioning is).

        Parameters
        ----------
        num_partitions : int
            The target number of partitions.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> kdf = ps.DataFrame({"age": [5, 5, 2, 2],
        ...         "name": ["Bob", "Bob", "Alice", "Alice"]}).set_index("age")
        >>> kdf.sort_index()  # doctest: +NORMALIZE_WHITESPACE
              name
        age
        2    Alice
        2    Alice
        5      Bob
        5      Bob
        >>> new_kdf = kdf.spark.coalesce(1)
        >>> new_kdf.to_spark().rdd.getNumPartitions()
        1
        >>> new_kdf.sort_index()   # doctest: +NORMALIZE_WHITESPACE
              name
        age
        2    Alice
        2    Alice
        5      Bob
        5      Bob
        """
        from pyspark.pandas.frame import DataFrame

        internal = self._kdf._internal.resolved_copy
        coalesced_sdf = internal.spark_frame.coalesce(num_partitions)
        return DataFrame(internal.with_new_sdf(coalesced_sdf))
Example #8
    def analyzed(self) -> "ps.DataFrame":
        """
        Returns a new DataFrame with the analyzed Spark DataFrame.

        After multiple operations, the underlying Spark plan could grow huge
        and make the Spark planner take a long time to finish the planning.

        This function is a workaround to avoid it.

        .. note:: After analyzed, operations between the analyzed DataFrame and the original one
            will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> df = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
        >>> df
           a  b
        0  1  4
        1  2  5
        2  3  6

        The analyzed one should return the same value.

        >>> df.spark.analyzed
           a  b
        0  1  4
        1  2  5
        2  3  6

        However, it won't work with the same anchor DataFrame.

        >>> df + df.spark.analyzed
        Traceback (most recent call last):
        ...
        ValueError: ... enable 'compute.ops_on_diff_frames' option.

        >>> with ps.option_context('compute.ops_on_diff_frames', True):
        ...     (df + df.spark.analyzed).sort_index()
           a   b
        0  2   8
        1  4  10
        2  6  12
        """
        from pyspark.pandas.frame import DataFrame

        return DataFrame(self._kdf._internal.resolved_copy)
Example #9
def _auto_patch_pandas() -> None:
    import pandas as pd

    # In order to use it in test cases.
    global _frame_has_class_getitem
    global _series_has_class_getitem

    _frame_has_class_getitem = hasattr(pd.DataFrame, "__class_getitem__")
    _series_has_class_getitem = hasattr(pd.Series, "__class_getitem__")

    if sys.version_info >= (3, 7):
        # Just in case pandas implements '__class_getitem__' later.
        if not _frame_has_class_getitem:
            pd.DataFrame.__class_getitem__ = lambda params: DataFrame.__class_getitem__(params)

        if not _series_has_class_getitem:
            pd.Series.__class_getitem__ = lambda params: Series.__class_getitem__(params)
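
A hedged sketch of what the patch enables, assuming the installed pandas does not define __class_getitem__ itself (the condition the function checks): plain pandas classes can then be subscripted in return-type hints the same way as ps.DataFrame / ps.Series.

import pandas as pd
import pyspark.pandas as ps  # importing pyspark.pandas applies _auto_patch_pandas()

# The subscript below is delegated to ps.DataFrame.__class_getitem__ by the patch.
def plus_one(pdf) -> pd.DataFrame[int, [int, int]]:
    return pdf + 1

psdf = ps.DataFrame({"a": [1, 2], "b": [3, 4]})
print(psdf.pandas_on_spark.apply_batch(plus_one))
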
Example #10
    def get_level_values(self, level: Union[int, Name]) -> Index:
        """
        Return vector of label values for requested level,
        equal to the length of the index.

        Parameters
        ----------
        level : int or str
            ``level`` is either the integer position of the level in the
            MultiIndex, or the name of the level.

        Returns
        -------
        values : Index
            Values is a level of this MultiIndex converted to
            a single :class:`Index` (or subclass thereof).

        Examples
        --------

        Create a MultiIndex:

        >>> mi = ps.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'a')])
        >>> mi.names = ['level_1', 'level_2']

        Get level values by supplying level as either integer or name:

        >>> mi.get_level_values(0)
        Index(['x', 'x', 'y'], dtype='object', name='level_1')

        >>> mi.get_level_values('level_2')
        Index(['a', 'b', 'a'], dtype='object', name='level_2')
        """
        level = self._get_level_number(level)
        index_scol = self._internal.index_spark_columns[level]
        index_name = self._internal.index_names[level]
        index_field = self._internal.index_fields[level]
        internal = self._internal.copy(
            index_spark_columns=[index_scol],
            index_names=[index_name],
            index_fields=[index_field],
            column_labels=[],
            data_spark_columns=[],
            data_fields=[],
        )
        return DataFrame(internal).index
Example #11
    def execute(self, index_col: Optional[Union[str, List[str]]]) -> DataFrame:
        """
        Returns a DataFrame for which the SQL statement has been executed by
        the underlying SQL engine.

        >>> from pyspark.pandas import sql_processor
        >>> # we will call 'sql_processor' directly in doctests so decrease one level.
        >>> sql_processor._CAPTURE_SCOPES = 2
        >>> sql = sql_processor.sql
        >>> str0 = 'abc'
        >>> sql("select {str0}")
           abc
        0  abc

        >>> str1 = 'abc"abc'
        >>> str2 = "abc'abc"
        >>> sql("select {str0}, {str1}, {str2}")
           abc  abc"abc  abc'abc
        0  abc  abc"abc  abc'abc

        >>> strs = ['a', 'b']
        >>> sql("select 'a' in {strs} as cond1, 'c' in {strs} as cond2")
           cond1  cond2
        0   True  False
        """
        blocks = _string.formatter_parser(self._statement)
        # TODO: use a string builder
        res = ""
        try:
            for (pre, inner, _, _) in blocks:
                var_next = "" if inner is None else self._convert(inner)
                res = res + pre + var_next
            self._normalized_statement = res

            sdf = self._session.sql(self._normalized_statement)
        finally:
            for v in self._temp_views:
                self._session.catalog.dropTempView(v)

        index_spark_columns, index_names = _get_index_map(sdf, index_col)

        return DataFrame(
            InternalFrame(spark_frame=sdf,
                          index_spark_columns=index_spark_columns,
                          index_names=index_names))
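
The doctests above do not exercise index_col; a hedged sketch of that path, passing the frame explicitly as a keyword argument (the frame itself is a placeholder, not from the source):

import pyspark.pandas as ps
from pyspark.pandas import sql_processor

psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"])
new_psdf = psdf.reset_index()
# Keep the former index as the index of the result instead of losing it.
print(sql_processor.sql("SELECT * FROM {new_psdf}", index_col="index", new_psdf=new_psdf))
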
Example #12
    def analyzed(self) -> "ps.Index":
        """
        Returns a new Index with the analyzed Spark DataFrame.

        After multiple operations, the underlying Spark plan could grow huge
        and make the Spark planner take a long time to finish the planning.

        This function is a workaround to avoid it.

        .. note:: After analyzed, operations between the analyzed Index and the original one
            will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.

        Returns
        -------
        Index

        Examples
        --------
        >>> idx = ps.Index([1, 2, 3])
        >>> idx
        Int64Index([1, 2, 3], dtype='int64')

        The analyzed one should return the same value.

        >>> idx.spark.analyzed
        Int64Index([1, 2, 3], dtype='int64')

        However, it won't work with the same anchor Index.

        >>> idx + idx.spark.analyzed
        Traceback (most recent call last):
        ...
        ValueError: ... enable 'compute.ops_on_diff_frames' option.

        >>> with ps.option_context('compute.ops_on_diff_frames', True):
        ...     (idx + idx.spark.analyzed).sort_values()
        Int64Index([2, 4, 6], dtype='int64')
        """
        from pyspark.pandas.frame import DataFrame

        return DataFrame(self._data._internal.resolved_copy).index
Example #13
    def repartition(self, num_partitions: int) -> "ps.DataFrame":
        """
        Returns a new DataFrame partitioned by the given partitioning expressions. The
        resulting DataFrame is hash partitioned.

        Parameters
        ----------
        num_partitions : int
            The target number of partitions.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> kdf = ps.DataFrame({"age": [5, 5, 2, 2],
        ...         "name": ["Bob", "Bob", "Alice", "Alice"]}).set_index("age")
        >>> kdf.sort_index()  # doctest: +NORMALIZE_WHITESPACE
              name
        age
        2    Alice
        2    Alice
        5      Bob
        5      Bob
        >>> new_kdf = kdf.spark.repartition(7)
        >>> new_kdf.to_spark().rdd.getNumPartitions()
        7
        >>> new_kdf.sort_index()   # doctest: +NORMALIZE_WHITESPACE
              name
        age
        2    Alice
        2    Alice
        5      Bob
        5      Bob
        """
        from pyspark.pandas.frame import DataFrame

        internal = self._kdf._internal.resolved_copy
        repartitioned_sdf = internal.spark_frame.repartition(num_partitions)
        return DataFrame(internal.with_new_sdf(repartitioned_sdf))
Example #14
    def hint(self, name: str, *parameters) -> "ps.DataFrame":
        """
        Specifies some hint on the current DataFrame.

        Parameters
        ----------
        name : A name of the hint.
        parameters : Optional parameters.

        Returns
        -------
        ret : DataFrame with the hint.

        See Also
        --------
        broadcast : Marks a DataFrame as small enough for use in broadcast joins.

        Examples
        --------
        >>> df1 = ps.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
        ...                     'value': [1, 2, 3, 5]},
        ...                    columns=['lkey', 'value']).set_index('lkey')
        >>> df2 = ps.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
        ...                     'value': [5, 6, 7, 8]},
        ...                    columns=['rkey', 'value']).set_index('rkey')
        >>> merged = df1.merge(df2.spark.hint("broadcast"), left_index=True, right_index=True)
        >>> merged.spark.explain()  # doctest: +ELLIPSIS
        == Physical Plan ==
        ...
        ...BroadcastHashJoin...
        ...
        """
        from pyspark.pandas.frame import DataFrame

        internal = self._kdf._internal.resolved_copy
        return DataFrame(
            internal.with_new_sdf(internal.spark_frame.hint(name,
                                                            *parameters)))
Example #15
    def local_checkpoint(self, eager: bool = True) -> "ps.DataFrame":
        """Returns a locally checkpointed version of this DataFrame.

        Checkpointing can be used to truncate the logical plan of this DataFrame, which is
        especially useful in iterative algorithms where the plan may grow exponentially. Local
        checkpoints are stored in the executors using the caching subsystem and therefore they are
        not reliable.

        Parameters
        ----------
        eager : bool
            Whether to locally checkpoint this DataFrame immediately

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> kdf = ps.DataFrame({"a": ["a", "b", "c"]})
        >>> kdf
           a
        0  a
        1  b
        2  c
        >>> new_kdf = kdf.spark.local_checkpoint()
        >>> new_kdf
           a
        0  a
        1  b
        2  c
        """
        from pyspark.pandas.frame import DataFrame

        internal = self._kdf._internal.resolved_copy
        checkpointed_sdf = internal.spark_frame.localCheckpoint(eager)
        return DataFrame(internal.with_new_sdf(checkpointed_sdf))
Example #16
    def predict(self, data):
        """
        Returns a prediction on the data.

        If the data is a pandas-on-Spark DataFrame, the return is a pandas-on-Spark Series.

        If the data is a pandas DataFrame, the return is the expected output of the underlying
        pyfunc object (typically a pandas Series or a numpy array).
        """
        if isinstance(data, pd.DataFrame):
            return self._model.predict(data)
        if isinstance(data, DataFrame):
            return_col = self._model_udf(*data._internal.data_spark_columns)
            # TODO: the columns should be named according to the mlflow spec
            # However, this is only possible with spark >= 3.0
            # s = F.struct(*data.columns)
            # return_col = self._model_udf(s)
            column_labels = [(col, )
                             for col in data._internal.spark_frame.select(
                                 return_col).columns]
            internal = data._internal.copy(column_labels=column_labels,
                                           data_spark_columns=[return_col],
                                           data_dtypes=None)
            return first_series(DataFrame(internal))
        else:
            raise ValueError("unknown data type: {}".format(type(data).__name__))
Example #17
    def checkpoint(self, eager: bool = True) -> "ps.DataFrame":
        """Returns a checkpointed version of this DataFrame.

        Checkpointing can be used to truncate the logical plan of this DataFrame, which is
        especially useful in iterative algorithms where the plan may grow exponentially. It will be
        saved to files inside the checkpoint directory set with `SparkContext.setCheckpointDir`.

        Parameters
        ----------
        eager : bool
            Whether to checkpoint this DataFrame immediately

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> kdf = ps.DataFrame({"a": ["a", "b", "c"]})
        >>> kdf
           a
        0  a
        1  b
        2  c
        >>> new_kdf = kdf.spark.checkpoint()  # doctest: +SKIP
        >>> new_kdf  # doctest: +SKIP
           a
        0  a
        1  b
        2  c
        """
        from pyspark.pandas.frame import DataFrame

        internal = self._kdf._internal.resolved_copy
        checkpointed_sdf = internal.spark_frame.checkpoint(eager)
        return DataFrame(internal.with_new_sdf(checkpointed_sdf))
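
The doctest is skipped because checkpoint() needs a checkpoint directory to be configured first; a hedged sketch of the full setup ("/tmp/ps_checkpoint" is a placeholder path, not from the source):

import pyspark.pandas as ps
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setCheckpointDir("/tmp/ps_checkpoint")   # required before checkpoint()

kdf = ps.DataFrame({"a": ["a", "b", "c"]})
new_kdf = kdf.spark.checkpoint()   # materializes the data and truncates the plan lineage
print(new_kdf.sort_index())
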
Example #18
    def from_frame(df: DataFrame,
                   names: Optional[List[Name]] = None) -> "MultiIndex":
        """
        Make a MultiIndex from a DataFrame.

        Parameters
        ----------
        df : DataFrame
            DataFrame to be converted to MultiIndex.
        names : list-like, optional
            If no names are provided, use the column names, or tuple of column
            names if the columns is a MultiIndex. If a sequence, overwrite
            names with the given sequence.

        Returns
        -------
        MultiIndex
            The MultiIndex representation of the given DataFrame.

        See Also
        --------
        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
        MultiIndex.from_product : Make a MultiIndex from cartesian product
                                  of iterables.

        Examples
        --------
        >>> df = ps.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
        ...                    ['NJ', 'Temp'], ['NJ', 'Precip']],
        ...                   columns=['a', 'b'])
        >>> df  # doctest: +SKIP
              a       b
        0    HI    Temp
        1    HI  Precip
        2    NJ    Temp
        3    NJ  Precip

        >>> ps.MultiIndex.from_frame(df)  # doctest: +SKIP
        MultiIndex([('HI',   'Temp'),
                    ('HI', 'Precip'),
                    ('NJ',   'Temp'),
                    ('NJ', 'Precip')],
                   names=['a', 'b'])

        Using explicit names, instead of the column names

        >>> ps.MultiIndex.from_frame(df, names=['state', 'observation'])  # doctest: +SKIP
        MultiIndex([('HI',   'Temp'),
                    ('HI', 'Precip'),
                    ('NJ',   'Temp'),
                    ('NJ', 'Precip')],
                   names=['state', 'observation'])
        """
        if not isinstance(df, DataFrame):
            raise TypeError("Input must be a DataFrame")
        sdf = df.to_spark()

        if names is None:
            names = df._internal.column_labels
        elif not is_list_like(names):
            raise TypeError("Names should be list-like for a MultiIndex")
        else:
            names = [
                name if is_name_like_tuple(name) else (name, )
                for name in names
            ]

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, col) for col in sdf.columns],
            index_names=names,
        )
        return cast(MultiIndex, DataFrame(internal).index)
Example #19
File: utils.py  Project: jerqi/spark
def align_diff_frames(
    resolve_func: Callable[
        ["DataFrame", List[Tuple], List[Tuple]], Iterator[Tuple["Series", Tuple]]
    ],
    this: "DataFrame",
    that: "DataFrame",
    fillna: bool = True,
    how: str = "full",
    preserve_order_column: bool = False,
) -> "DataFrame":
    """
    This method aligns two different DataFrames with a given `func`. Columns are resolved and
    handled within the given `func`.
    To use this, `compute.ops_on_diff_frames` should be True, for now.

    :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
        the column of another DataFrame. It returns an iterable that produces Series.

        >>> from pyspark.pandas.config import set_option, reset_option
        >>>
        >>> set_option("compute.ops_on_diff_frames", True)
        >>>
        >>> psdf1 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>> psdf2 = ps.DataFrame({'a': [9, 8, 7, 6, 5, 4, 3, 2, 1]})
        >>>
        >>> def func(psdf, this_column_labels, that_column_labels):
        ...    psdf  # conceptually this is A + B.
        ...
        ...    # Within this function, Series from A or B can be performed against `psdf`.
        ...    this_label = this_column_labels[0]  # this is ('a',) from psdf1.
        ...    that_label = that_column_labels[0]  # this is ('a',) from psdf2.
        ...    new_series = (psdf[this_label] - psdf[that_label]).rename(str(this_label))
        ...
        ...    # This new series will be placed in new DataFrame.
        ...    yield (new_series, this_label)
        >>>
        >>>
        >>> align_diff_frames(func, psdf1, psdf2).sort_index()
           a
        0  0
        1  0
        2  0
        3  0
        4  0
        5  0
        6  0
        7  0
        8  0
        >>> reset_option("compute.ops_on_diff_frames")

    :param this: a DataFrame to align
    :param that: another DataFrame to align
    :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
        Otherwise, the missing values are left as they are.
    :param how: join way. In addition, it affects how `resolve_func` resolves the column conflict.
        - full: `resolve_func` should resolve only common columns from 'this' and 'that' DataFrames.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` and
            'that_columns' in this function are B, C and B, C.
        - left: `resolve_func` should resolve columns including `that`'s columns.
            For instance, if 'this' has columns A, B, C and that has B, C, D, `this_columns` is
            B, C but `that_columns` are B, C, D.
        - inner: Same as 'full' mode; however, internally performs inner join instead.
    :return: Aligned DataFrame
    """
    from pyspark.pandas.frame import DataFrame

    assert how == "full" or how == "left" or how == "inner"

    this_column_labels = this._internal.column_labels
    that_column_labels = that._internal.column_labels
    common_column_labels = set(this_column_labels).intersection(that_column_labels)

    # 1. Perform the join given two dataframes.
    combined = combine_frames(this, that, how=how, preserve_order_column=preserve_order_column)

    # 2. Apply the given function to transform the columns in a batch and keep the new columns.
    combined_column_labels = combined._internal.column_labels

    that_columns_to_apply = []  # type: List[Tuple]
    this_columns_to_apply = []  # type: List[Tuple]
    additional_that_columns = []  # type: List[Tuple]
    columns_to_keep = []  # type: List[Union[Series, Column]]
    column_labels_to_keep = []  # type: List[Tuple]

    for combined_label in combined_column_labels:
        for common_label in common_column_labels:
            if combined_label == tuple(["this", *common_label]):
                this_columns_to_apply.append(combined_label)
                break
            elif combined_label == tuple(["that", *common_label]):
                that_columns_to_apply.append(combined_label)
                break
        else:
            if how == "left" and combined_label in [
                tuple(["that", *label]) for label in that_column_labels
            ]:
                # In this case, we will drop `that_columns` in `columns_to_keep` but passes
                # it later to `func`. `func` should resolve it.
                # Note that adding this into a separate list (`additional_that_columns`)
                # is intentional so that `this_columns` and `that_columns` can be paired.
                additional_that_columns.append(combined_label)
            elif fillna:
                columns_to_keep.append(SF.lit(None).cast(DoubleType()).alias(str(combined_label)))
                column_labels_to_keep.append(combined_label)
            else:
                columns_to_keep.append(combined._psser_for(combined_label))
                column_labels_to_keep.append(combined_label)

    that_columns_to_apply += additional_that_columns

    # Should extract columns to apply and do it in a batch in case
    # it adds new columns for example.
    if len(this_columns_to_apply) > 0 or len(that_columns_to_apply) > 0:
        psser_set, column_labels_set = zip(
            *resolve_func(combined, this_columns_to_apply, that_columns_to_apply)
        )
        columns_applied = list(psser_set)  # type: List[Union[Series, Column]]
        column_labels_applied = list(column_labels_set)  # type: List[Tuple]
    else:
        columns_applied = []
        column_labels_applied = []

    applied = DataFrame(
        combined._internal.with_new_columns(
            columns_applied + columns_to_keep,
            column_labels=column_labels_applied + column_labels_to_keep,
        )
    )  # type: DataFrame

    # 3. Restore the names back and deduplicate columns.
    this_labels = OrderedDict()
    # Add columns in an order of its original frame.
    for this_label in this_column_labels:
        for new_label in applied._internal.column_labels:
            if new_label[1:] not in this_labels and this_label == new_label[1:]:
                this_labels[new_label[1:]] = new_label

    # After that, we will add the rest columns.
    other_labels = OrderedDict()
    for new_label in applied._internal.column_labels:
        if new_label[1:] not in this_labels:
            other_labels[new_label[1:]] = new_label

    psdf = applied[list(this_labels.values()) + list(other_labels.values())]
    psdf.columns = psdf.columns.droplevel()
    return psdf
Example #20
File: utils.py  Project: jerqi/spark
def combine_frames(
    this: "DataFrame",
    *args: DataFrameOrSeries,
    how: str = "full",
    preserve_order_column: bool = False
) -> "DataFrame":
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has the prefixes `this_` and `that_` to distinguish
    the column names from both DataFrames.

    It internally performs a join operation which can be expensive in general.
    So, if `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from pyspark.pandas.config import get_option
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.internal import (
        InternalField,
        InternalFrame,
        HIDDEN_COLUMNS,
        NATURAL_ORDER_COLUMN_NAME,
        SPARK_INDEX_NAME_FORMAT,
    )
    from pyspark.pandas.series import Series

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            same_anchor(arg, args[0]) for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        assert not same_anchor(this, args[0]), "We don't need to combine. All series is in this."
        that = args[0]._psdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        assert not same_anchor(
            this, args[0]
        ), "We don't need to combine. `this` and `that` are same."
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or " "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):

        def resolve(internal: InternalFrame, side: str) -> InternalFrame:
            rename = lambda col: "__{}_{}".format(side, col)
            internal = internal.resolved_copy
            sdf = internal.spark_frame
            sdf = internal.spark_frame.select(
                *[
                    scol_for(sdf, col).alias(rename(col))
                    for col in sdf.columns
                    if col not in HIDDEN_COLUMNS
                ],
                *HIDDEN_COLUMNS
            )
            return internal.copy(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.index_spark_column_names
                ],
                index_fields=[
                    field.copy(name=rename(field.name)) for field in internal.index_fields
                ],
                data_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.data_spark_column_names
                ],
                data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields],
            )

        this_internal = resolve(this._internal, "this")
        that_internal = resolve(that._internal, "that")

        this_index_map = list(
            zip(
                this_internal.index_spark_column_names,
                this_internal.index_names,
                this_internal.index_fields,
            )
        )
        that_index_map = list(
            zip(
                that_internal.index_spark_column_names,
                that_internal.index_names,
                that_internal.index_fields,
            )
        )
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = list(zip(this_index_map, that_index_map))

        this_sdf = this_internal.spark_frame.alias("this")
        that_sdf = that_internal.spark_frame.alias("that")

        # If the same named index is found, that's used.
        index_column_names = []
        index_use_extension_dtypes = []
        for (
            i,
            ((this_column, this_name, this_field), (that_column, that_name, that_field)),
        ) in enumerate(this_and_that_index_map):
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this_sdf, this_column)
                that_scol = scol_for(that_sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)

                column_name = SPARK_INDEX_NAME_FORMAT(i)
                index_column_names.append(column_name)
                index_use_extension_dtypes.append(
                    any(field.is_extension_dtype for field in [this_field, that_field])
                )
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(column_name)
                )
            else:
                raise ValueError("Index names must be exactly matched currently.")

        assert len(join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this_sdf.join(that_sdf, on=join_scols, how=how)

        if preserve_order_column:
            order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)]
        else:
            order_column = []

        joined_df = joined_df.select(
            *merged_index_scols,
            *(
                scol_for(this_sdf, this_internal.spark_column_name_for(label))
                for label in this_internal.column_labels
            ),
            *(
                scol_for(that_sdf, that_internal.spark_column_name_for(label))
                for label in that_internal.column_labels
            ),
            *order_column
        )

        index_spark_columns = [scol_for(joined_df, col) for col in index_column_names]

        index_columns = set(index_column_names)
        new_data_columns = [
            col
            for col in joined_df.columns
            if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME
        ]

        schema = joined_df.select(*index_spark_columns, *new_data_columns).schema

        index_fields = [
            InternalField.from_struct_field(struct_field, use_extension_dtypes=use_extension_dtypes)
            for struct_field, use_extension_dtypes in zip(
                schema.fields[: len(index_spark_columns)], index_use_extension_dtypes
            )
        ]
        data_fields = [
            InternalField.from_struct_field(
                struct_field, use_extension_dtypes=field.is_extension_dtype
            )
            for struct_field, field in zip(
                schema.fields[len(index_spark_columns) :],
                this_internal.data_fields + that_internal.data_fields,
            )
        ]

        level = max(this_internal.column_labels_level, that_internal.column_labels_level)

        def fill_label(label: Optional[Tuple]) -> List:
            if label is None:
                return ([""] * (level - 1)) + [None]
            else:
                return ([""] * (level - len(label))) + list(label)

        column_labels = [
            tuple(["this"] + fill_label(label)) for label in this_internal.column_labels
        ] + [tuple(["that"] + fill_label(label)) for label in that_internal.column_labels]
        column_label_names = (
            cast(List[Optional[Tuple]], [None]) * (1 + level - this_internal.column_labels_level)
        ) + this_internal.column_label_names
        return DataFrame(
            InternalFrame(
                spark_frame=joined_df,
                index_spark_columns=index_spark_columns,
                index_names=this_internal.index_names,
                index_fields=index_fields,
                column_labels=column_labels,
                data_spark_columns=[scol_for(joined_df, col) for col in new_data_columns],
                data_fields=data_fields,
                column_label_names=column_label_names,
            )
        )
    else:
        raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)
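
combine_frames is an internal utility; a hedged sketch of what it produces, assuming compute.ops_on_diff_frames is enabled for the duration of the call:

import pyspark.pandas as ps
from pyspark.pandas.config import option_context
from pyspark.pandas.utils import combine_frames

psdf1 = ps.DataFrame({"a": [1, 2, 3]})
psdf2 = ps.DataFrame({"b": [4, 5, 6]})

with option_context("compute.ops_on_diff_frames", True):
    combined = combine_frames(psdf1, psdf2)
    # Column labels are prefixed to tell the frames apart, e.g. ('this', 'a') and ('that', 'b').
    print(combined.columns)
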
Example #21
def sql(
    query: str,
    index_col: Optional[Union[str, List[str]]] = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Execute a SQL query and return the result as a pandas-on-Spark DataFrame.

    This function acts as a standard Python string formatter with understanding
    the following variable types:

        * pandas-on-Spark DataFrame
        * pandas-on-Spark Series
        * pandas DataFrame
        * pandas Series
        * string

    Parameters
    ----------
    query : str
        the SQL query
    index_col : str or list of str, optional
        Column names to be used in Spark to represent pandas-on-Spark's index. The index name
        in pandas-on-Spark is ignored. By default, the index is always lost.

        .. note:: If you want to preserve the index, explicitly use :func:`DataFrame.reset_index`,
            and pass it to the sql statement with `index_col` parameter.

            For example,

            >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B":[4, 5, 6]}, index=['a', 'b', 'c'])
            >>> new_psdf = psdf.reset_index()
            >>> ps.sql("SELECT * FROM {new_psdf}", index_col="index", new_psdf=new_psdf)
            ... # doctest: +NORMALIZE_WHITESPACE
                   A  B
            index
            a      1  4
            b      2  5
            c      3  6

            For MultiIndex,

            >>> psdf = ps.DataFrame(
            ...     {"A": [1, 2, 3], "B": [4, 5, 6]},
            ...     index=pd.MultiIndex.from_tuples(
            ...         [("a", "b"), ("c", "d"), ("e", "f")], names=["index1", "index2"]
            ...     ),
            ... )
            >>> new_psdf = psdf.reset_index()
            >>> ps.sql(
            ...     "SELECT * FROM {new_psdf}", index_col=["index1", "index2"], new_psdf=new_psdf)
            ... # doctest: +NORMALIZE_WHITESPACE
                           A  B
            index1 index2
            a      b       1  4
            c      d       2  5
            e      f       3  6

            Also note that the index name(s) should match the existing name(s).
    kwargs
        other variables that the user wants to set that can be referenced in the query

    Returns
    -------
    pandas-on-Spark DataFrame

    Examples
    --------

    Calling a built-in SQL function.

    >>> ps.sql("SELECT * FROM range(10) where id > 7")
       id
    0   8
    1   9

    >>> ps.sql("SELECT * FROM range(10) WHERE id > {bound1} AND id < {bound2}", bound1=7, bound2=9)
       id
    0   8

    >>> mydf = ps.range(10)
    >>> x = tuple(range(4))
    >>> ps.sql("SELECT {ser} FROM {mydf} WHERE id IN {x}", ser=mydf.id, mydf=mydf, x=x)
       id
    0   0
    1   1
    2   2
    3   3

    Mixing pandas-on-Spark and pandas DataFrames in a join operation. Note that the index is
    dropped.

    >>> ps.sql('''
    ...   SELECT m1.a, m2.b
    ...   FROM {table1} m1 INNER JOIN {table2} m2
    ...   ON m1.key = m2.key
    ...   ORDER BY m1.a, m2.b''',
    ...   table1=ps.DataFrame({"a": [1,2], "key": ["a", "b"]}),
    ...   table2=pd.DataFrame({"b": [3,4,5], "key": ["a", "b", "b"]}))
       a  b
    0  1  3
    1  2  4
    2  2  5

    Also, it is possible to query using Series.

    >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B":[4, 5, 6]}, index=['a', 'b', 'c'])
    >>> ps.sql("SELECT {mydf.A} FROM {mydf}", mydf=psdf)
       A
    0  1
    1  2
    2  3
    """
    if os.environ.get("PYSPARK_PANDAS_SQL_LEGACY") == "1":
        from pyspark.pandas import sql_processor

        warnings.warn(
            "Deprecated in 3.3.0, and the legacy behavior "
            "will be removed in the future releases.",
            FutureWarning,
        )
        return sql_processor.sql(query, index_col=index_col, **kwargs)

    session = default_session()
    formatter = PandasSQLStringFormatter(session)
    try:
        sdf = session.sql(formatter.format(query, **kwargs))
    finally:
        formatter.clear()

    index_spark_columns, index_names = _get_index_map(sdf, index_col)

    return DataFrame(
        InternalFrame(spark_frame=sdf,
                      index_spark_columns=index_spark_columns,
                      index_names=index_names))
Example #22
    def transform_batch(
        self, func: Callable[..., Union[pd.DataFrame, pd.Series]], *args: Any, **kwargs: Any
    ) -> DataFrameOrSeries:
        """
        Transform chunks with a function that takes a pandas DataFrame and outputs a pandas DataFrame.
        The pandas DataFrame given to the function is a batch used internally. The length of
        each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` on each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a NumPy compound type style
            as below:

            >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[
            ...         (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame or Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
        Series.pandas_on_spark.transform_batch: Transforms each pandas chunk of the Series.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ps.DataFrame[("index", int), [('A', int), ('B', int)]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)  # doctest: +NORMALIZE_WHITESPACE
               A  B
        index
        0      2  3
        1      4  5
        2      6  7

        >>> def plus_one_func(pdf) -> ps.Series[int]:
        ...     return pdf.B + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).pandas_on_spark.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series
        from pyspark import pandas as ps

        assert callable(func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_retain_index = should_infer_schema
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
            return func(pdf).to_frame()

        def pandas_series_func(
            f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType
        ) -> "UserDefinedFunctionLike":
            ff = f

            @pandas_udf(returnType=return_type)  # type: ignore[call-overload]
            def udf(pdf: pd.DataFrame) -> pd.Series:
                return first_series(ff(pdf))

            return udf

        if should_infer_schema:
            # Here we execute with the first 1000 to get the return type.
            # If the records were less than 1000, it uses pandas API directly for a shortcut.
            log_advice(
                "If the type hints is not specified for `transform_batch`, "
                "it is expensive to infer the data type internally."
            )
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self._psdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed)
                )
            if len(transformed) != len(pdf):
                raise ValueError("transform_batch cannot produce aggregated results")
            psdf_or_psser = ps.from_pandas(transformed)

            if isinstance(psdf_or_psser, ps.Series):
                psser = cast(ps.Series, psdf_or_psser)

                field = psser._internal.data_fields[0].normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._psdf._internal.copy(
                    column_labels=psser._internal.column_labels,
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=psser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                psdf = cast(DataFrame, psdf_or_psser)
                if len(pdf) <= limit:
                    # only do the short cut when it returns a frame to avoid
                    # operations on different dataframes in case of series.
                    return psdf

                index_fields = [
                    field.normalize_spark_type() for field in psdf._internal.index_fields
                ]
                data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

                return_schema = StructType(
                    [field.struct_field for field in index_fields + data_fields]
                )

                self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                return DataFrame(
                    psdf._internal.with_new_sdf(
                        spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
                    )
                )
        else:
            return_type = infer_return_type(original_func)
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            if is_return_series:
                field = InternalField(
                    dtype=cast(SeriesType, return_type).dtype,
                    struct_field=StructField(
                        name=SPARK_DEFAULT_SERIES_NAME,
                        dataType=cast(SeriesType, return_type).spark_type,
                    ),
                ).normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                internal = self._psdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                index_fields = cast(DataFrameType, return_type).index_fields
                index_fields = [index_field.normalize_spark_type() for index_field in index_fields]
                data_fields = [
                    field.normalize_spark_type()
                    for field in cast(DataFrameType, return_type).data_fields
                ]
                normalized_fields = index_fields + data_fields
                return_schema = StructType([field.struct_field for field in normalized_fields])
                should_retain_index = len(index_fields) > 0

                self_applied = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=should_retain_index  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                index_spark_columns = None
                index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None

                if should_retain_index:
                    index_spark_columns = [
                        scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
                    ]

                    if not any(
                        [
                            SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)
                            for index_field in index_fields
                        ]
                    ):
                        index_names = [
                            (index_field.struct_field.name,) for index_field in index_fields
                        ]
                internal = InternalFrame(
                    spark_frame=sdf,
                    index_names=index_names,
                    index_spark_columns=index_spark_columns,
                    index_fields=index_fields,
                    data_fields=data_fields,
                )
                return DataFrame(internal)
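
A quick aside: the branch above relies on a generic Spark pattern, namely packing every column into a single struct, running a pandas UDF that returns a struct, and expanding the result back into columns with selectExpr. Below is a minimal standalone sketch of that pattern (assuming Spark 3.x; `plus_one` and `__temp_struct__` are illustrative names, not part of the code above):

import pandas as pd
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import LongType, StructField, StructType

spark = SparkSession.builder.getOrCreate()
sdf = spark.range(3).withColumn("x", F.col("id") * 2)

return_schema = StructType([StructField("id", LongType()), StructField("x", LongType())])

@pandas_udf(return_schema)
def plus_one(batch: pd.DataFrame) -> pd.DataFrame:
    # The struct column arrives as a pandas DataFrame; return one matching the schema.
    return batch + 1

applied = plus_one(F.struct(*sdf.columns)).alias("__temp_struct__")
result = sdf.select(applied).selectExpr("__temp_struct__.*")
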
Example #23
    def attach_id_column(self, id_type: str, column: Name) -> "DataFrame":
        """
        Attach a column to be used as identifier of rows similar to the default index.

        See also `Default Index type
        <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        id_type : string
            The id type.

            - 'sequence' : a sequence that increases one by one.

              .. note:: this uses Spark's Window without specifying a partition specification.
                  This moves all the data into a single partition on a single machine and
                  could cause serious performance degradation.
                  Avoid this method with very large datasets.

            - 'distributed-sequence' : a sequence that increases one by one,
              by group-by and group-map approach in a distributed manner.
            - 'distributed' : a monotonically increasing sequence simply by using PySpark’s
              monotonically_increasing_id function in a fully distributed manner.

        column : string or tuple of string
            The column name.

        Returns
        -------
        DataFrame
            The DataFrame with the attached column.

        Examples
        --------
        >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
           x  id
        0  a   0
        1  b   1
        2  c   2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
           x  0
        0  a  0
        1  b  1
        2  c  2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
        ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
           x  0.0
        0  a  ...
        1  b  ...
        2  c  ...

        For multi-index columns:

        >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
           x id-x
           y id-y
        0  a    0
        1  b    1
        2  c    2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
           x   0
           y 1.0
        0  a   0
        1  b   1
        2  c   2
        """
        from pyspark.pandas.frame import DataFrame

        if id_type == "sequence":
            attach_func = InternalFrame.attach_sequence_column
        elif id_type == "distributed-sequence":
            attach_func = InternalFrame.attach_distributed_sequence_column
        elif id_type == "distributed":
            attach_func = InternalFrame.attach_distributed_column
        else:
            raise ValueError(
                "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
            )

        assert is_name_like_value(column, allow_none=False), column
        if not is_name_like_tuple(column):
            column = (column,)

        internal = self._psdf._internal

        if len(column) != internal.column_labels_level:
            raise ValueError(
                "The given column `{}` must be the same length as the existing columns.".format(
                    column
                )
            )
        elif column in internal.column_labels:
            raise ValueError(
                "The given column `{}` already exists.".format(name_like_string(column))
            )

        # Make sure the underlying Spark column names are the form of
        # `name_like_string(column_label)`.
        sdf = internal.spark_frame.select(
            [
                scol.alias(SPARK_INDEX_NAME_FORMAT(i))
                for i, scol in enumerate(internal.index_spark_columns)
            ]
            + [
                scol.alias(name_like_string(label))
                for scol, label in zip(internal.data_spark_columns, internal.column_labels)
            ]
        )
        sdf = attach_func(sdf, name_like_string(column))

        return DataFrame(
            InternalFrame(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
                ],
                index_names=internal.index_names,
                index_fields=internal.index_fields,
                column_labels=internal.column_labels + [column],
                data_spark_columns=(
                    [scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
                    + [scol_for(sdf, name_like_string(column))]
                ),
                data_fields=internal.data_fields
                + [
                    InternalField.from_struct_field(
                        StructField(name_like_string(column), LongType(), nullable=False)
                    )
                ],
                column_label_names=internal.column_label_names,
            ).resolved_copy
        )
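
For reference, the 'distributed' id type described in the docstring boils down to the plain PySpark call sketched below (the DataFrame and column names are illustrative only):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("a",), ("b",), ("c",)], ["x"])

# Monotonically increasing but not consecutive: values encode partition id and offset.
sdf_with_id = sdf.withColumn("id", F.monotonically_increasing_id())
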
Example #24
    def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        if isinstance(right, (list, tuple)):
            from pyspark.pandas.series import first_series, scol_for
            from pyspark.pandas.frame import DataFrame
            from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField

            len_right = len(right)
            if len(left) != len_right:
                raise ValueError("Lengths must be equal")

            sdf = left._internal.spark_frame
            structed_scol = F.struct(
                sdf[NATURAL_ORDER_COLUMN_NAME],
                *left._internal.index_spark_columns,
                left.spark.column,
            )
            # The size of the list is expected to be small.
            collected_structed_scol = F.collect_list(structed_scol)
            # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee the order.
            collected_structed_scol = F.array_sort(collected_structed_scol)
            right_values_scol = F.array(*(F.lit(x) for x in right))
            index_scol_names = left._internal.index_spark_column_names
            scol_name = left._internal.spark_column_name_for(
                left._internal.column_labels[0])
            # Compare the values of left and right by using zip_with function.
            cond = F.zip_with(
                collected_structed_scol,
                right_values_scol,
                lambda x, y: F.struct(
                    *[
                        x[index_scol_name].alias(index_scol_name)
                        for index_scol_name in index_scol_names
                    ],
                    F.when(x[scol_name].isNull() | y.isNull(), False)
                    .otherwise(x[scol_name] == y)
                    .alias(scol_name),
                ),
            ).alias(scol_name)
            # 1. `sdf_new` here looks like the below (the first field of each set is Index):
            # +----------------------------------------------------------+
            # |0                                                         |
            # +----------------------------------------------------------+
            # |[{0, false}, {1, true}, {2, false}, {3, true}, {4, false}]|
            # +----------------------------------------------------------+
            sdf_new = sdf.select(cond)
            # 2. `sdf_new` after the explode looks like the below:
            # +----------+
            # |       col|
            # +----------+
            # |{0, false}|
            # | {1, true}|
            # |{2, false}|
            # | {3, true}|
            # |{4, false}|
            # +----------+
            sdf_new = sdf_new.select(F.explode(scol_name))
            # 3. Here, the final `sdf_new` looks like the below:
            # +-----------------+-----+
            # |__index_level_0__|    0|
            # +-----------------+-----+
            # |                0|false|
            # |                1| true|
            # |                2|false|
            # |                3| true|
            # |                4|false|
            # +-----------------+-----+
            sdf_new = sdf_new.select("col.*")

            index_spark_columns = [
                scol_for(sdf_new, index_scol_name)
                for index_scol_name in index_scol_names
            ]
            data_spark_columns = [scol_for(sdf_new, scol_name)]

            internal = left._internal.copy(
                spark_frame=sdf_new,
                index_spark_columns=index_spark_columns,
                data_spark_columns=data_spark_columns,
                index_fields=[
                    InternalField.from_struct_field(index_field)
                    for index_field in sdf_new.select(
                        index_spark_columns).schema.fields
                ],
                data_fields=[
                    InternalField.from_struct_field(
                        sdf_new.select(data_spark_columns).schema.fields[0])
                ],
            )
            return first_series(DataFrame(internal))
        else:
            from pyspark.pandas.base import column_op

            return column_op(Column.__eq__)(left, right)
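
The list branch above is dense, so here is a minimal standalone sketch of the same trick (assuming Spark >= 3.1 for `F.zip_with`; the data and column names are made up): collect the column into one array, zip it with a literal array of the right-hand values, and compare elementwise.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(0, 1), (1, 2), (2, 4)], ["idx", "val"])

# One row holding the whole column as an array of structs, ordered by the index.
collected = F.array_sort(F.collect_list(F.struct("idx", "val")))
right = F.array(*[F.lit(v) for v in [1, 3, 4]])

# Elementwise comparison, then explode back into one row per element.
cond = F.zip_with(collected, right, lambda x, y: x["val"] == y).alias("eq")
sdf.select(cond).select(F.explode("eq").alias("eq")).show()
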
Example #25
    def apply_batch(
        self, func: Callable[..., pd.DataFrame], args: Tuple = (), **kwds: Any
    ) -> "DataFrame":
        """
        Apply a function that takes a pandas DataFrame and outputs a pandas DataFrame. The pandas
        DataFrame given to the function is a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` with each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int, [int]]:
            ...     return pd.DataFrame([len(pdf)])
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.apply_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...
            10  83
            11  83

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a NumPy compound type style
            as below:

            >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[
            ...         (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to apply to each pandas frame.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            DataFrame.
        **kwds
            Additional keyword arguments to pass as keyword arguments to
            `func`.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.apply: For row/columnwise operations.
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.
        Series.pandas_on_spark.transform_batch: Transform each pandas chunk of a Series.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def query_func(pdf) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf.query('A == 1')
        >>> df.pandas_on_spark.apply_batch(query_func)
           c0  c1
        0   1   2

        >>> def query_func(pdf) -> ps.DataFrame[("idx", int), [("A", int), ("B", int)]]:
        ...     return pdf.query('A == 1')
        >>> df.pandas_on_spark.apply_batch(query_func)  # doctest: +NORMALIZE_WHITESPACE
             A  B
        idx
        0    1  2

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.apply_batch(lambda pdf: pdf.query('A == 1'))
           A  B
        0  1  2

        You can also specify extra arguments.

        >>> def calculation(pdf, y, z) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf ** y + z
        >>> df.pandas_on_spark.apply_batch(calculation, args=(10,), z=20)
                c0        c1
        0       21      1044
        1    59069   1048596
        2  9765645  60466196

        You can also use ``np.ufunc`` and built-in functions as input.

        >>> df.pandas_on_spark.apply_batch(np.add, args=(10,))
            A   B
        0  11  12
        1  13  14
        2  15  16

        >>> (df * -1).pandas_on_spark.apply_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        """
        # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate?

        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark import pandas as ps

        if not isinstance(func, FunctionType):
            assert callable(func), "the first argument should be a callable function."
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None

        original_func = func
        func = lambda o: original_func(o, *args, **kwds)

        self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy)

        if should_infer_schema:
            # Here we execute with the first `limit + 1` rows to get the return type.
            # If there are no more records than the limit, the pandas API is used directly as a shortcut.
            log_advice(
                "If the type hints are not specified for `apply_batch`, "
                "it is expensive to infer the data type internally."
            )
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self_applied.head(limit + 1)._to_internal_pandas()
            applied = func(pdf)
            if not isinstance(applied, pd.DataFrame):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(applied)
                )
            psdf: DataFrame = DataFrame(applied)
            if len(pdf) <= limit:
                return psdf

            index_fields = [field.normalize_spark_type() for field in psdf._internal.index_fields]
            data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

            return_schema = StructType([field.struct_field for field in index_fields + data_fields])

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=True
            )
            sdf = self_applied._internal.spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator), schema=return_schema
            )

            # If schema is inferred, we can restore indexes too.
            internal = psdf._internal.with_new_sdf(
                spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
            )
        else:
            return_type = infer_return_type(original_func)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe:
                raise TypeError(
                    "The given function should specify a frame as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            index_fields = cast(DataFrameType, return_type).index_fields
            should_retain_index = len(index_fields) > 0
            return_schema = cast(DataFrameType, return_type).spark_type

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=should_retain_index
            )
            sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                lambda iterator: map(output_func, iterator), schema=return_schema
            )

            index_spark_columns = None
            index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None

            if should_retain_index:
                index_spark_columns = [
                    scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
                ]

                if not any(
                    [
                        SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)
                        for index_field in index_fields
                    ]
                ):
                    index_names = [(index_field.struct_field.name,) for index_field in index_fields]
            internal = InternalFrame(
                spark_frame=sdf,
                index_names=index_names,
                index_spark_columns=index_spark_columns,
                index_fields=index_fields,
                data_fields=cast(DataFrameType, return_type).data_fields,
            )
        return DataFrame(internal)
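
`apply_batch` is built on top of `DataFrame.mapInPandas` (Spark 3.0+). A minimal standalone sketch of that primitive, with an illustrative filter function:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, 2), (3, 4), (5, 6)], ["A", "B"])

def filter_batches(batches):
    # `batches` is an iterator of pandas DataFrames, one per internal Arrow batch.
    for pdf in batches:
        yield pdf.query("A == 1")

sdf.mapInPandas(filter_batches, schema="A long, B long").show()
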
Example #26
    def transform_batch(self, func, *args, **kwargs) -> Union["DataFrame", "Series"]:
        """
        Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame.
        The pandas DataFrame given to the function is a batch used internally. The length of
        each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` with each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas friendly style as below:

            >>> def plus_one(x) -> ps.DataFrame['a': float, 'b': float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

            When the given function returns DataFrame and has the return type annotated, the
            original index of the DataFrame will be lost and then a default index will be attached
            to the result. Please be careful about configuring the default index. See also
            `Default Index Type
            <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame or Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
        Series.pandas_on_spark.transform_batch: Transform each pandas chunk of a Series.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ps.DataFrame[int, int]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ps.DataFrame['A': int, 'B': int]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> def plus_one_func(pdf) -> ps.Series[int]:
        ...     return pdf.B + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).pandas_on_spark.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series
        from pyspark import pandas as ps

        assert callable(func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        names = self._psdf._internal.to_internal_spark_frame.schema.names

        def pandas_concat(series):
            # The input can only be a DataFrame for a struct from Spark 3.0.
            # This works around it by rebuilding the input as a single frame. See SPARK-27240
            pdf = pd.concat(series, axis=1)
            pdf.columns = names
            return pdf

        def apply_func(pdf):
            return func(pdf).to_frame()

        def pandas_extract(pdf, name):
            # This works around the output being a DataFrame for a struct
            # from Spark 3.0. See SPARK-23836
            return pdf[name]

        def pandas_series_func(f):
            ff = f
            return lambda *series: first_series(ff(*series))

        def pandas_frame_func(f, field_name):
            ff = f
            return lambda *series: pandas_extract(ff(pandas_concat(series)), field_name)

        if should_infer_schema:
            # Here we execute with the first `limit + 1` rows to get the return type.
            # If there are no more records than the limit, the pandas API is used directly as a shortcut.
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self._psdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed)
                )
            if len(transformed) != len(pdf):
                raise ValueError("transform_batch cannot produce aggregated results")
            psdf_or_psser = ps.from_pandas(transformed)

            if isinstance(psdf_or_psser, ps.Series):
                psser = cast(ps.Series, psdf_or_psser)

                spark_return_type = force_decimal_precision_scale(
                    as_nullable_spark_type(psser.spark.data_type)
                )
                return_schema = StructType(
                    [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)]
                )
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)(
                    pandas_series_func(output_func)
                )
                columns = self._psdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._psdf._internal.copy(
                    column_labels=psser._internal.column_labels,
                    data_spark_columns=[
                        pudf(F.struct(*columns)).alias(psser._internal.data_spark_column_names[0])
                    ],
                    data_dtypes=psser._internal.data_dtypes,
                    column_label_names=psser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                psdf = cast(DataFrame, psdf_or_psser)
                if len(pdf) <= limit:
                    # Only take the shortcut when the result is a frame, to avoid
                    # operations on different dataframes in the series case.
                    return psdf

                # Force nullability.
                return_schema = force_decimal_precision_scale(
                    as_nullable_spark_type(psdf._internal.to_internal_spark_frame.schema)
                )

                self_applied = DataFrame(self._psdf._internal.resolved_copy)  # type: DataFrame

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                    output_func
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                return DataFrame(psdf._internal.with_new_sdf(sdf))
        else:
            return_type = infer_return_type(original_func)
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            if is_return_series:
                spark_return_type = force_decimal_precision_scale(
                    as_nullable_spark_type(cast(SeriesType, return_type).spark_type)
                )
                return_schema = StructType(
                    [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)]
                )
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)(
                    pandas_series_func(output_func)
                )
                columns = self._psdf._internal.spark_columns
                internal = self._psdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[pudf(F.struct(*columns)).alias(SPARK_DEFAULT_SERIES_NAME)],
                    data_dtypes=[cast(SeriesType, return_type).dtype],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                return_schema = cast(DataFrameType, return_type).spark_type

                self_applied = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                    output_func
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                internal = InternalFrame(
                    spark_frame=sdf,
                    index_spark_columns=None,
                    data_dtypes=cast(DataFrameType, return_type).dtypes,
                )
                return DataFrame(internal)
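
A short usage note on the length constraint enforced above; this is a sketch based on the docstring and the ValueError raised in the schema-inference path, using only APIs shown in the examples:

import pyspark.pandas as ps

psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# transform_batch must keep the length of each batch unchanged.
psdf.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)

# apply_batch may change the number of rows, e.g. by filtering.
psdf.pandas_on_spark.apply_batch(lambda pdf: pdf.query("A > 1"))

# A length-changing function passed to transform_batch raises
# ValueError("transform_batch cannot produce aggregated results").
# psdf.pandas_on_spark.transform_batch(lambda pdf: pdf.head(1))
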
Example #27
    def drop(self,
             codes: List[Any],
             level: Optional[Union[int, Name]] = None) -> "MultiIndex":
        """
        Make new MultiIndex with passed list of labels deleted

        Parameters
        ----------
        codes : array-like
            Must be a list of tuples
        level : int or level name, default None

        Returns
        -------
        dropped : MultiIndex

        Examples
        --------
        >>> index = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
        >>> index # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['a']) # doctest: +SKIP
        MultiIndex([('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['x', 'y'], level=1) # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        internal = self._internal.resolved_copy
        sdf = internal.spark_frame
        index_scols = internal.index_spark_columns
        if level is None:
            scol = index_scols[0]
        elif isinstance(level, int):
            scol = index_scols[level]
        else:
            scol = None
            for index_spark_column, index_name in zip(
                    internal.index_spark_columns, internal.index_names):
                if not isinstance(level, tuple):
                    level = (level, )
                if level == index_name:
                    if scol is not None:
                        raise ValueError(
                            "The name {} occurs multiple times, use a level number"
                            .format(name_like_string(level)))
                    scol = index_spark_column
            if scol is None:
                raise KeyError("Level {} not found".format(
                    name_like_string(level)))
        sdf = sdf[~scol.isin(codes)]

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col) for col in internal.index_spark_column_names
            ],
            index_names=internal.index_names,
            index_fields=internal.index_fields,
            column_labels=[],
            data_spark_columns=[],
            data_fields=[],
        )
        return cast(MultiIndex, DataFrame(internal).index)
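
Under the hood, the drop above is just a row filter on the chosen index level. A minimal sketch in plain PySpark (the column names are illustrative):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("a", "x"), ("b", "y"), ("c", "z")], ["level0", "level1"])

# Keep only rows whose first level is not in the dropped codes.
sdf.filter(~F.col("level0").isin(["a"])).show()
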
Example #28
    def symmetric_difference(  # type: ignore[override]
        self,
        other: Index,
        result_name: Optional[List[Name]] = None,
        sort: Optional[bool] = None,
    ) -> "MultiIndex":
        """
        Compute the symmetric difference of two MultiIndex objects.

        Parameters
        ----------
        other : Index or array-like
        result_name : list
        sort : True or None, default None
            Whether to sort the resulting index.
            * True : Attempt to sort the result.
            * None : Do not sort the result.

        Returns
        -------
        symmetric_difference : MultiIndex

        Notes
        -----
        ``symmetric_difference`` contains elements that appear in either
        ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
        ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
        dropped.

        Examples
        --------
        >>> midx1 = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                        ['speed', 'weight', 'length']],
        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
        >>> midx2 = pd.MultiIndex([['pandas-on-Spark', 'cow', 'falcon'],
        ...                        ['speed', 'weight', 'length']],
        ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
        >>> s1 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
        ...                index=midx1)
        >>> s2 = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
        ...              index=midx2)

        >>> s1.index.symmetric_difference(s2.index)  # doctest: +SKIP
        MultiIndex([('pandas-on-Spark', 'speed'),
                    (  'lama', 'speed')],
                   )

        You can set names of result Index.

        >>> s1.index.symmetric_difference(s2.index, result_name=['a', 'b'])  # doctest: +SKIP
        MultiIndex([('pandas-on-Spark', 'speed'),
                    (  'lama', 'speed')],
                   names=['a', 'b'])

        You can set sort to `True`, if you want to sort the resulting index.

        >>> s1.index.symmetric_difference(s2.index, sort=True)  # doctest: +SKIP
        MultiIndex([('pandas-on-Spark', 'speed'),
                    (  'lama', 'speed')],
                   )

        You can also use the ``^`` operator:

        >>> s1.index ^ s2.index  # doctest: +SKIP
        MultiIndex([('pandas-on-Spark', 'speed'),
                    (  'lama', 'speed')],
                   )
        """
        if type(self) != type(other):
            raise NotImplementedError(
                "Doesn't support symmetric_difference between Index & MultiIndex for now"
            )

        sdf_self = self._psdf._internal.spark_frame.select(
            self._internal.index_spark_columns)
        sdf_other = other._psdf._internal.spark_frame.select(
            other._internal.index_spark_columns)

        sdf_symdiff = sdf_self.union(sdf_other).subtract(
            sdf_self.intersect(sdf_other))

        if sort:
            sdf_symdiff = sdf_symdiff.sort(*self._internal.index_spark_columns)

        internal = InternalFrame(
            spark_frame=sdf_symdiff,
            index_spark_columns=[
                scol_for(sdf_symdiff, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_fields=self._internal.index_fields,
        )
        result = cast(MultiIndex, DataFrame(internal).index)

        if result_name:
            result.names = result_name

        return result
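
The core of symmetric_difference is the set algebra on the two index frames, (A union B) minus (A intersect B). A minimal standalone sketch with made-up data:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
a = spark.createDataFrame([(1,), (2,), (3,)], ["v"])
b = spark.createDataFrame([(2,), (3,), (4,)], ["v"])

# Rows in either frame but not both; subtract/intersect also remove duplicates.
symdiff = a.union(b).subtract(a.intersect(b))
symdiff.show()  # 1 and 4
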
Example #29
    def swaplevel(self, i: int = -2, j: int = -1) -> "MultiIndex":
        """
        Swap level i with level j.
        Calling this method does not change the ordering of the values.

        Parameters
        ----------
        i : int, str, default -2
            First level of index to be swapped. Can pass level name as string.
            Type of parameters can be mixed.
        j : int, str, default -1
            Second level of index to be swapped. Can pass level name as string.
            Type of parameters can be mixed.

        Returns
        -------
        MultiIndex
            A new MultiIndex.

        Examples
        --------
        >>> midx = ps.MultiIndex.from_arrays([['a', 'b'], [1, 2]], names = ['word', 'number'])
        >>> midx  # doctest: +SKIP
        MultiIndex([('a', 1),
                    ('b', 2)],
                   names=['word', 'number'])

        >>> midx.swaplevel(0, 1)  # doctest: +SKIP
        MultiIndex([(1, 'a'),
                    (2, 'b')],
                   names=['number', 'word'])

        >>> midx.swaplevel('number', 'word')  # doctest: +SKIP
        MultiIndex([(1, 'a'),
                    (2, 'b')],
                   names=['number', 'word'])
        """
        for index in (i, j):
            if not isinstance(index, int) and index not in self.names:
                raise KeyError("Level %s not found" % index)

        i = i if isinstance(i, int) else self.names.index(i)
        j = j if isinstance(j, int) else self.names.index(j)

        for index in (i, j):
            if index >= len(self.names) or index < -len(self.names):
                raise IndexError("Too many levels: Index has only %s levels, "
                                 "%s is not a valid level number" %
                                 (len(self.names), index))

        index_map = list(
            zip(
                self._internal.index_spark_columns,
                self._internal.index_names,
                self._internal.index_fields,
            ))
        index_map[i], index_map[j] = index_map[j], index_map[i]
        index_spark_columns, index_names, index_fields = zip(*index_map)
        internal = self._internal.copy(
            index_spark_columns=list(index_spark_columns),
            index_names=list(index_names),
            index_fields=list(index_fields),
            column_labels=[],
            data_spark_columns=[],
            data_fields=[],
        )
        return cast(MultiIndex, DataFrame(internal).index)
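
The swap itself is plain Python over the internal (column, name, field) triples; a minimal sketch with placeholder values standing in for the real internal objects:

# Placeholder triples standing in for (index_spark_column, index_name, index_field).
index_map = [("scol_word", ("word",), "field_word"), ("scol_number", ("number",), "field_number")]
i, j = 0, 1

index_map[i], index_map[j] = index_map[j], index_map[i]
index_spark_columns, index_names, index_fields = zip(*index_map)
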
Example #30
    def insert(self, loc: int, item: Any) -> Index:
        """
        Make new MultiIndex inserting new item at location.

        Follows Python list.insert semantics for negative values.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        new_index : MultiIndex

        Examples
        --------
        >>> psmidx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> psmidx.insert(3, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z'),
                    ('h', 'j')],
                   )

        For negative values

        >>> psmidx.insert(-2, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('h', 'j'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )
        """
        length = len(self)
        if loc < 0:
            loc = loc + length
            if loc < 0:
                raise IndexError(
                    "index {} is out of bounds for axis 0 with size {}".format(
                        (loc - length), length))
        else:
            if loc > length:
                raise IndexError(
                    "index {} is out of bounds for axis 0 with size {}".format(
                        loc, length))

        index_name = [
            (name, ) for name in self._internal.index_spark_column_names
        ]  # type: List[Label]
        sdf_before = self.to_frame(name=index_name)[:loc].to_spark()
        sdf_middle = Index([item]).to_frame(name=index_name).to_spark()
        sdf_after = self.to_frame(name=index_name)[loc:].to_spark()
        sdf = sdf_before.union(sdf_middle).union(sdf_after)

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_fields=[
                InternalField(field.dtype)
                for field in self._internal.index_fields
            ],
        )
        return DataFrame(internal).index
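
The strategy above generalizes to any positional insert on a Spark-backed frame: split at `loc`, build a one-row frame for the new item, and union the three pieces. A minimal sketch in plain PySpark (illustrative names; the split is done on the driver only for brevity):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

rows = [("a", "x"), ("b", "y"), ("c", "z")]
loc = 1
before = spark.createDataFrame(rows[:loc], ["level0", "level1"])
middle = spark.createDataFrame([("h", "j")], ["level0", "level1"])
after = spark.createDataFrame(rows[loc:], ["level0", "level1"])

# union is positional, so the three frames only need matching column order.
sdf = before.union(middle).union(after)
sdf.show()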