def check_hist_plot(kser):
    """Assert that ``kser.plot(kind="hist")`` equals a hand-built plotly figure.

    The expected figure holds one stacked bar trace over ten bins spanning
    1.0 to 50.0 with hard-coded counts. ``self`` is taken from the enclosing
    scope and is expected to provide ``assertEqual``.
    """
    edges = np.array([
        1.0, 5.9, 10.8, 15.7, 20.6, 25.5, 30.4, 35.3, 40.2, 45.1, 50.0
    ])
    counts = np.array([5.0, 4.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])

    # Hover labels are half-open intervals; only the last bin is closed on
    # the right, so its trailing ")" is swapped for "]".
    labels = ["[%s, %s)" % (lo, hi) for lo, hi in zip(edges[:-1], edges[1:])]
    labels[-1] = labels[-1][:-1] + "]"

    # Each bar is placed at the midpoint of its bin.
    centers = 0.5 * (edges[:-1] + edges[1:])

    series_name = name_like_string(kser.name)
    hover = "variable=" + series_name + "<br>value=%{text}<br>count=%{y}"
    trace = go.Bar(
        x=centers,
        y=counts,
        name=series_name,
        text=labels,
        hovertemplate=hover,
    )
    expected = go.Figure(data=[trace], layout=go.Layout(barmode="stack"))
    expected["layout"]["xaxis"]["title"] = "value"
    expected["layout"]["yaxis"]["title"] = "count"

    # Compare the pretty-printed dict forms of the actual and expected figures.
    actual = kser.plot(kind="hist")
    self.assertEqual(pprint.pformat(actual.to_dict()),
                     pprint.pformat(expected.to_dict()))
예제 #2
0
파일: typehints.py 프로젝트: ynuosoft/spark
    def __init__(self, dtypes: List[Dtype], spark_types: List[types.DataType],
                 names: List[Optional[str]]):
        """Store the dtypes and build the Spark struct schema of the columns.

        Each entry of ``names`` is paired with the entry of ``spark_types`` at
        the same position. A column whose name is ``None`` receives the
        positional fallback name ``c<i>``.
        """
        from pyspark.pandas.utils import name_like_string

        self.dtypes = dtypes

        struct_fields = []
        for position, (column_name, spark_type) in enumerate(zip(names, spark_types)):
            if column_name is None:
                field_name = "c%s" % position
            else:
                field_name = name_like_string(column_name)
            struct_fields.append(types.StructField(field_name, spark_type))

        self.spark_type = types.StructType(struct_fields)  # type: types.StructType
        def check_pox_plot(kser):
            """Assert that ``kser.plot(kind="box")`` equals a hand-built plotly box figure.

            ``self`` is taken from the enclosing scope and is expected to
            provide ``assertEqual``.
            """
            series_name = name_like_string(kser.name)

            # Expected box statistics are hard-coded for the series under test.
            expected_box = go.Box(
                name=series_name,
                q1=[3],
                median=[6],
                q3=[9],
                mean=[10.0],
                lowerfence=[1],
                upperfence=[15],
                y=[[50]],
                boxpoints="suspectedoutliers",
                notched=False,
            )
            expected = go.Figure()
            expected.add_trace(expected_box)
            expected["layout"]["xaxis"]["title"] = series_name
            expected["layout"]["yaxis"]["title"] = "value"

            # Compare the pretty-printed dict forms of the actual and expected figures.
            actual = kser.plot(kind="box")
            self.assertEqual(pprint.pformat(actual.to_dict()),
                             pprint.pformat(expected.to_dict()))
예제 #4
0
    def __init__(
        self, dtypes: List[Dtype], spark_types: List[types.DataType], names: List[Optional[str]]
    ):
        """Build one ``InternalField`` per column and store them in ``self.fields``.

        ``names``, ``dtypes`` and ``spark_types`` are walked in lockstep. A
        column whose name is ``None`` receives the positional fallback name
        ``c<i>``.
        """
        from pyspark.pandas.internal import InternalField
        from pyspark.pandas.utils import name_like_string

        fields = []
        for position, (column_name, dtype, spark_type) in enumerate(
            zip(names, dtypes, spark_types)
        ):
            if column_name is not None:
                field_name = name_like_string(column_name)
            else:
                field_name = "c%s" % position
            struct_field = types.StructField(name=field_name, dataType=spark_type)
            fields.append(InternalField(dtype=dtype, struct_field=struct_field))

        self.fields = fields
예제 #5
0
def infer_return_type(
        f: Callable
) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Derive a return-type descriptor from the return annotation of ``f``.

    The annotation may be a plain type, a ``ps.Series[...]`` hint, a
    ``ps.DataFrame[...]`` hint, or any of those written as a string. The
    descriptor that comes back carries the dtype(s) (a pandas only dtype object
    or a numpy dtype object) together with the matching Spark DataType.

    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> ps.DataFrame[np.float]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
    """
    # Import locally so that 'SeriesType' is the canonical class and not one
    # treated as local to this module. See Series.__class_getitem__, which
    # imports the class the same way.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    def _is_holder(param, holder_cls):
        # True only for classes deriving from the given holder base class.
        return isclass(param) and issubclass(param, holder_cls)

    tpe = getfullargspec(f).annotations.get("return", None)
    if isinstance(tpe, str):
        # String hints are used to avoid forward references; resolve them first.
        tpe = resolve_string_type_hint(tpe)

    if hasattr(tpe, "__origin__"):
        origin = tpe.__origin__
        if origin == ps.DataFrame or origin == ps.Series:
            # Python versions lower than 3.7 wrap the hint; unwrap it to the
            # underlying Tuple/SeriesType type hint.
            tpe = tpe.__args__[0]

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        element_type = tpe.__args__[0]
        if issubclass(element_type, NameTypeHolder):
            element_type = element_type.tpe
        dtype, spark_type = pandas_on_spark_type(element_type)
        return SeriesType(dtype, spark_type)

    # DataFrame type hints are materialized as a Tuple.
    # Python 3.6 exposes `__name__`; Python 3.7 and 3.8 expose `_name`.
    hint_name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if hint_name != "Tuple":
        # Anything that is neither a Series nor a DataFrame hint is a scalar.
        scalar_types = pandas_on_spark_type(tpe)
        if scalar_types is None:
            return UnknownType(tpe)
        return ScalarType(*scalar_types)

    # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead of '__args__'.
    # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
    if hasattr(tpe, "__tuple_params__"):
        parameters = tpe.__tuple_params__
    else:
        parameters = tpe.__args__

    index_parameters = [p for p in parameters if _is_holder(p, IndexNameTypeHolder)]
    data_parameters = [p for p in parameters if p not in index_parameters]
    assert len(
        data_parameters) > 0, "Type hints for data must not be empty."

    # With no index hint the loop body never runs and the list stays empty.
    index_fields = []
    for level, index_parameter in enumerate(index_parameters):
        index_name = index_parameter.name
        index_dtype, index_spark_type = pandas_on_spark_type(index_parameter.tpe)
        if index_name is None:
            # Unnamed index levels receive the default Spark index column name.
            index_name = SPARK_INDEX_NAME_FORMAT(level)
        index_fields.append(
            InternalField(
                dtype=index_dtype,
                struct_field=types.StructField(
                    name=index_name,
                    dataType=index_spark_type,
                ),
            ))

    # Named data hints carry the actual type in `.tpe`; bare hints are the type.
    data_dtypes, data_spark_types = zip(
        *(pandas_on_spark_type(p.tpe if _is_holder(p, NameTypeHolder) else p)
          for p in data_parameters))
    data_names = [
        p.name if _is_holder(p, NameTypeHolder) else None
        for p in data_parameters
    ]
    data_fields = [
        InternalField(
            dtype=data_dtype,
            struct_field=types.StructField(
                # Unnamed data columns fall back to the positional name c<i>.
                name=(name_like_string(data_name)
                      if data_name is not None else ("c%s" % position)),
                dataType=data_spark_type,
            ),
        )
        for position, (data_name, data_dtype, data_spark_type) in enumerate(
            zip(data_names, data_dtypes, data_spark_types))
    ]

    return DataFrameType(index_fields=index_fields,
                         data_fields=data_fields)
예제 #6
0
    def compute_hist(psdf, bins):
        """Count, for every column of ``psdf``, how many rows land in each bin.

        ``bins`` holds the bin edges as a numpy array. The result is a list
        with one pandas Series per input column, in column order; each Series
        is named after its column and has one entry per bin
        (``len(bins) - 1`` entries), with 0 for bins that received no rows.
        """
        assert isinstance(bins, (np.ndarray, np.generic))

        internal = psdf._internal
        input_column_names = [
            name_like_string(label) for label in internal.column_labels
        ]
        scols = [
            internal.spark_column_for(label).alias(input_column_name)
            for label, input_column_name in zip(internal.column_labels,
                                                input_column_names)
        ]
        sdf = internal.spark_frame.select(*scols)

        # 1. Bucketize each column and stack the results into one flat frame
        #    tagged with the position of the column they came from:
        #     +----------+--------+
        #     |__group_id|__bucket|
        #     +----------+--------+
        #     |0         |0.0     |
        #     |0         |1.0     |
        #     |1         |0.0     |
        #     |1         |2.0     |
        #     +----------+--------+
        bucket_names = []
        output_df = None
        for group_id, colname in enumerate(sdf.columns):
            bucket_name = "__{}_bucket".format(colname)
            bucket_names.append(bucket_name)

            # The Bucketizer assigns the index of the matching bin to each
            # value; it is configured with handleInvalid="skip".
            bucketizer = Bucketizer(splits=bins,
                                    inputCol=colname,
                                    outputCol=bucket_name,
                                    handleInvalid="skip")
            flat_df = bucketizer.transform(sdf).select(
                F.lit(group_id).alias("__group_id"),
                F.col(bucket_name).alias("__bucket"))
            output_df = flat_df if output_df is None else output_df.union(flat_df)

        # 2. Count the rows per (group, bucket) pair and bring them to pandas:
        #     +----------+--------+------+
        #     |__group_id|__bucket| count|
        #     +----------+--------+------+
        #     |0         |0.0     |2     |
        #     |0         |1.0     |1     |
        #     |1         |0.0     |2     |
        #     +----------+--------+------+
        counted_df = output_df.groupby("__group_id", "__bucket").agg(
            F.count("*").alias("count"))
        result = counted_df.toPandas().sort_values(
            by=["__group_id", "__bucket"])

        # 3. For each group, left-join its counts onto the full list of bin
        #    indexes so that empty bins show up with a count of zero, yielding
        #    one single-column result per input column.
        all_buckets = pd.DataFrame({"__bucket": np.arange(0, len(bins) - 1)})
        output_series = []
        for group_id, input_column_name in enumerate(input_column_names):
            group_counts = result[result["__group_id"] == group_id]
            filled = all_buckets.merge(group_counts,
                                       how="left",
                                       on=["__bucket"]).fillna(0)
            pdf = filled[["count"]]
            pdf.columns = [input_column_name]
            output_series.append(pdf[input_column_name])

        return output_series
예제 #7
0
    def attach_id_column(self, id_type: str, column: Name) -> "DataFrame":
        """
        Append a row-identifier column, generated the same way as a default index.

        See also `Default Index type
        <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        id_type : string
            The id type.

            - 'sequence' : a sequence that increases one by one.

              .. note:: this uses Spark's Window without specifying partition specification.
                  This leads to move all data into single partition in single machine and
                  could cause serious performance degradation.
                  Avoid this method against very large dataset.

            - 'distributed-sequence' : a sequence that increases one by one,
              by group-by and group-map approach in a distributed manner.
            - 'distributed' : a monotonically increasing sequence simply by using PySpark's
              monotonically_increasing_id function in a fully distributed manner.

        column : string or tuple of string
            The column name.

        Returns
        -------
        DataFrame
            The DataFrame attached the column.

        Raises
        ------
        ValueError
            If ``id_type`` is not one of the supported values, if ``column`` does not have
            as many levels as the existing column labels, or if ``column`` already exists.

        Examples
        --------
        >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
           x  id
        0  a   0
        1  b   1
        2  c   2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
           x  0
        0  a  0
        1  b  1
        2  c  2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
        ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
           x  0.0
        0  a  ...
        1  b  ...
        2  c  ...

        For multi-index columns:

        >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
           x id-x
           y id-y
        0  a    0
        1  b    1
        2  c    2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
           x   0
           y 1.0
        0  a   0
        1  b   1
        2  c   2
        """
        from pyspark.pandas.frame import DataFrame

        if id_type == "sequence":
            attach_func = InternalFrame.attach_sequence_column
        elif id_type == "distributed-sequence":
            attach_func = InternalFrame.attach_distributed_sequence_column
        elif id_type == "distributed":
            attach_func = InternalFrame.attach_distributed_column
        else:
            raise ValueError(
                "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
            )

        assert is_name_like_value(column, allow_none=False), column
        # Column labels are handled as tuples; wrap anything that is not one.
        new_label = column if is_name_like_tuple(column) else (column,)

        internal = self._psdf._internal

        if len(new_label) != internal.column_labels_level:
            raise ValueError(
                "The given column `{}` must be the same length as the existing columns.".format(
                    new_label
                )
            )
        if new_label in internal.column_labels:
            raise ValueError(
                "The given column `{}` already exists.".format(name_like_string(new_label))
            )

        id_column_name = name_like_string(new_label)
        data_column_names = [name_like_string(label) for label in internal.column_labels]

        # Make sure the underlying Spark column names are the form of
        # `name_like_string(column_label)`.
        index_aliases = [
            scol.alias(SPARK_INDEX_NAME_FORMAT(level))
            for level, scol in enumerate(internal.index_spark_columns)
        ]
        data_aliases = [
            scol.alias(data_column_name)
            for scol, data_column_name in zip(internal.data_spark_columns, data_column_names)
        ]
        sdf = attach_func(internal.spark_frame.select(index_aliases + data_aliases), id_column_name)

        # The attached id column is declared as a non-nullable long field.
        id_field = InternalField.from_struct_field(
            StructField(id_column_name, LongType(), nullable=False)
        )
        new_internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, SPARK_INDEX_NAME_FORMAT(level))
                for level in range(internal.index_level)
            ],
            index_names=internal.index_names,
            index_fields=internal.index_fields,
            column_labels=internal.column_labels + [new_label],
            data_spark_columns=(
                [scol_for(sdf, data_column_name) for data_column_name in data_column_names]
                + [scol_for(sdf, id_column_name)]
            ),
            data_fields=internal.data_fields + [id_field],
            column_label_names=internal.column_label_names,
        )
        return DataFrame(new_internal.resolved_copy)
예제 #8
0
    def drop(self,
             codes: List[Any],
             level: Optional[Union[int, Name]] = None) -> "MultiIndex":
        """
        Return a new MultiIndex without the entries whose value at ``level`` is in ``codes``.

        Parameters
        ----------
        codes : array-like
            Must be a list of tuples
        level : int or level name, default None
            The level to match against. ``None`` selects the first level.

        Returns
        -------
        dropped : MultiIndex

        Raises
        ------
        ValueError
            If ``level`` is a name carried by more than one index level.
        KeyError
            If ``level`` is a name that no index level carries.

        Examples
        --------
        >>> index = ps.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
        >>> index # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['a']) # doctest: +SKIP
        MultiIndex([('b', 'y'),
                    ('c', 'z')],
                   )

        >>> index.drop(['x', 'y'], level=1) # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        resolved = self._internal.resolved_copy
        index_scols = resolved.index_spark_columns

        if level is None or isinstance(level, int):
            # Positional lookup; a missing level means the first index level.
            target_scol = index_scols[0 if level is None else level]
        else:
            # Lookup by name: exactly one index level must carry the name.
            target_scol = None
            for candidate, index_name in zip(index_scols, resolved.index_names):
                # Index names are tuples, so the level is compared as a tuple.
                if not isinstance(level, tuple):
                    level = (level, )
                if level == index_name:
                    if target_scol is not None:
                        raise ValueError(
                            "The name {} occurs multiple times, use a level number"
                            .format(name_like_string(level)))
                    target_scol = candidate
            if target_scol is None:
                raise KeyError("Level {} not found".format(
                    name_like_string(level)))

        # Keep only the rows whose value at the chosen level is not in `codes`.
        filtered_sdf = resolved.spark_frame[~target_scol.isin(codes)]

        # The rebuilt frame carries index columns only; it has no data columns.
        index_only = InternalFrame(
            spark_frame=filtered_sdf,
            index_spark_columns=[
                scol_for(filtered_sdf, column_name)
                for column_name in resolved.index_spark_column_names
            ],
            index_names=resolved.index_names,
            index_fields=resolved.index_fields,
            column_labels=[],
            data_spark_columns=[],
            data_fields=[],
        )
        return cast(MultiIndex, DataFrame(index_only).index)
예제 #9
0
    def frame(self, index_col: Optional[Union[str, List[str]]] = None) -> SparkDataFrame:
        """
        Export the current DataFrame as a Spark DataFrame.  :meth:`DataFrame.spark.frame` is an
        alias of  :meth:`DataFrame.to_spark`.

        Parameters
        ----------
        index_col: str or list of str, optional, default: None
            Column names to be used in Spark to represent pandas-on-Spark's index. The index name
            in pandas-on-Spark is ignored. By default, the index is always lost.

        Raises
        ------
        ValueError
            If the number of names in ``index_col`` differs from the number of index levels,
            or if any name in ``index_col`` collides with a data column name.

        See Also
        --------
        DataFrame.to_spark
        DataFrame.to_koalas
        DataFrame.spark.frame

        Examples
        --------
        By default, this method loses the index as below.

        >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
        >>> df.to_spark().show()  # doctest: +NORMALIZE_WHITESPACE
        +---+---+---+
        |  a|  b|  c|
        +---+---+---+
        |  1|  4|  7|
        |  2|  5|  8|
        |  3|  6|  9|
        +---+---+---+

        >>> df = ps.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
        >>> df.spark.frame().show()  # doctest: +NORMALIZE_WHITESPACE
        +---+---+---+
        |  a|  b|  c|
        +---+---+---+
        |  1|  4|  7|
        |  2|  5|  8|
        |  3|  6|  9|
        +---+---+---+

        If `index_col` is set, it keeps the index column as specified.

        >>> df.to_spark(index_col="index").show()  # doctest: +NORMALIZE_WHITESPACE
        +-----+---+---+---+
        |index|  a|  b|  c|
        +-----+---+---+---+
        |    0|  1|  4|  7|
        |    1|  2|  5|  8|
        |    2|  3|  6|  9|
        +-----+---+---+---+

        Keeping index column is useful when you want to call some Spark APIs and
        convert it back to pandas-on-Spark DataFrame without creating a default index, which
        can affect performance.

        >>> spark_df = df.to_spark(index_col="index")
        >>> spark_df = spark_df.filter("a == 2")
        >>> spark_df.to_koalas(index_col="index")  # doctest: +NORMALIZE_WHITESPACE
               a  b  c
        index
        1      2  5  8

        In case of multi-index, specify a list to `index_col`.

        >>> new_df = df.set_index("a", append=True)
        >>> new_spark_df = new_df.to_spark(index_col=["index_1", "index_2"])
        >>> new_spark_df.show()  # doctest: +NORMALIZE_WHITESPACE
        +-------+-------+---+---+
        |index_1|index_2|  b|  c|
        +-------+-------+---+---+
        |      0|      1|  4|  7|
        |      1|      2|  5|  8|
        |      2|      3|  6|  9|
        +-------+-------+---+---+

        Likewise, can be converted to back to pandas-on-Spark DataFrame.

        >>> new_spark_df.to_koalas(
        ...     index_col=["index_1", "index_2"])  # doctest: +NORMALIZE_WHITESPACE
                         b  c
        index_1 index_2
        0       1        4  7
        1       2        5  8
        2       3        6  9
        """
        from pyspark.pandas.utils import name_like_string

        internal = self._kdf._internal

        data_column_names = []
        data_columns = []
        labelled_columns = zip(
            internal.column_labels,
            internal.data_spark_columns,
            internal.data_spark_column_names,
        )
        for position, (label, spark_column, current_name) in enumerate(labelled_columns):
            # A column without a label is exported under its position.
            exported_name = name_like_string(label) if label is not None else str(position)
            data_column_names.append(exported_name)
            # Alias only when the Spark column is not already named as required.
            if current_name == exported_name:
                data_columns.append(spark_column)
            else:
                data_columns.append(spark_column.alias(exported_name))

        if index_col is None:
            # No index requested: export the data columns only.
            return internal.spark_frame.select(data_columns)

        if isinstance(index_col, str):
            index_col = [index_col]

        old_index_scols = internal.index_spark_columns

        if len(index_col) != len(old_index_scols):
            raise ValueError(
                "length of index columns is %s; however, the length of the given "
                "'index_col' is %s." % (len(old_index_scols), len(index_col))
            )

        if any(col in data_column_names for col in index_col):
            raise ValueError("'index_col' cannot be overlapped with other columns.")

        new_index_scols = [
            index_scol.alias(index_name)
            for index_scol, index_name in zip(old_index_scols, index_col)
        ]
        return internal.spark_frame.select(new_index_scols + data_columns)
예제 #10
0
def infer_return_type(
        f: Callable
) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    Parameters
    ----------
    f : Callable
        A function whose ``return`` annotation is inspected. The annotation may
        also be given as a string; it is resolved through ``get_type_hints``.

    Returns
    -------
    SeriesType
        When the annotation is a parameterized ``SeriesType`` (what
        ``ps.Series[...]`` produces in the examples below).
    DataFrameType
        When the annotation is a ``Tuple`` (what ``ps.DataFrame[...]`` produces
        in the examples below). Index type hints become ``index_fields`` and
        the remaining ones become ``data_fields``; unnamed data columns are
        named ``c0``, ``c1``, ... by position.
    UnknownType
        When ``pandas_on_spark_type`` returns ``None`` for the annotation.
    ScalarType
        In all the other cases.

    Raises
    ------
    ValueError
        If the function has no return type annotation.

    Examples
    --------
    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[np.float64, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> ps.DataFrame[np.float64]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.DataFrame[np.float64, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> 'ps.DataFrame[np.float64]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> ps.DataFrame['a': np.float64, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> def func() -> "ps.DataFrame['a': np.float64, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    tpe = get_type_hints(f).get("return", None)

    if tpe is None:
        raise ValueError("A return value is required for the input function")

    # `__origin__` is not always a class (e.g. `typing.Union` for `Optional[...]`), and
    # `issubclass` raises TypeError on non-classes, so guard with `isclass` first.
    origin = getattr(tpe, "__origin__", None)
    if isclass(origin) and issubclass(origin, SeriesType):
        tpe = tpe.__args__[0]
        if isclass(tpe) and issubclass(tpe, NameTypeHolder):
            # Unwrap the holder to get the actual type it carries.
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Tuple has _name but other types have __name__
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    # Check if the name is Tuple.
    if name == "Tuple":
        parameters = tpe.__args__

        def is_holder(param, holder_class):
            # Parameters may be plain (non-class) objects, hence the `isclass` guard.
            return isclass(param) and issubclass(param, holder_class)

        index_parameters = [p for p in parameters if is_holder(p, IndexNameTypeHolder)]
        data_parameters = [p for p in parameters if p not in index_parameters]
        # Kept as an assertion so that the raised exception type stays the same.
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        # One field per index level; stays empty when there is no type hint for index.
        index_fields = []
        for level, index_parameter in enumerate(index_parameters):
            index_name = index_parameter.name
            index_dtype, index_spark_type = pandas_on_spark_type(index_parameter.tpe)
            if index_name is None:
                # Unnamed index levels fall back to the default Spark index column name.
                index_name = SPARK_INDEX_NAME_FORMAT(level)
            index_fields.append(
                InternalField(
                    dtype=index_dtype,
                    struct_field=types.StructField(
                        name=index_name, dataType=index_spark_type
                    ),
                )
            )

        # Resolve every data type first, then build the fields.
        data_types = [
            pandas_on_spark_type(p.tpe if is_holder(p, NameTypeHolder) else p)
            for p in data_parameters
        ]
        data_fields = []
        for i, (parameter, (data_dtype, data_spark_type)) in enumerate(
            zip(data_parameters, data_types)
        ):
            data_name = parameter.name if is_holder(parameter, NameTypeHolder) else None
            # Unnamed data columns are named after their position: c0, c1, ...
            field_name = name_like_string(data_name) if data_name is not None else ("c%s" % i)
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(name=field_name, dataType=data_spark_type),
                )
            )

        return DataFrameType(index_fields=index_fields, data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    return ScalarType(*tpes)