Example #1
        def try_infer_return_type():
            def f() -> 'ks.DataFrame["a" : np.float : 1, "b":str:2]':  # noqa: F821
                pass

            infer_return_type(f)
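Examples #1 through #4 (and #10 below) share one pattern: a deliberately malformed or un-mappable return annotation, exercised by calling `infer_return_type` on the inner function. In the Koalas test suite such helpers are wrapped in an assertion that inference raises. A minimal, self-contained sketch of the same check — a hedged illustration, assuming the usual Koalas imports and an older NumPy where `np.float` still exists:

    import numpy as np
    from databricks import koalas as ks  # noqa: F401 -- resolved when the string annotation is evaluated
    from databricks.koalas.typedef import infer_return_type

    def f() -> 'ks.DataFrame["a" : np.float : 1, "b":str:2]':  # noqa: F821
        pass

    # A three-element slice ("a" : np.float : 1) is not a valid name:type pair,
    # so inference is expected to raise (a TypeError in recent Koalas versions).
    try:
        infer_return_type(f)
    except TypeError as e:
        print("rejected as expected:", e)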
Example #2
        def try_infer_return_type():
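            # Context not shown in this excerpt: `pdf` is presumably a frame whose
            # dtypes cannot all be mapped to Spark types, so inference should fail.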
            def f() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
                pass

            infer_return_type(f)
Example #3
        def try_infer_return_type():
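            # `A` is presumably a class from the enclosing test with no Spark type
            # mapping, so inference should fail.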
            def f() -> ks.DataFrame[A]:
                pass

            infer_return_type(f)
Example #4
        def try_infer_return_type():
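            # `pdf.a` presumably has a dtype (e.g. object) with no Spark mapping
            # in this negative test, so inference should fail.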
            def f() -> ks.Series[pdf.a.dtype]:  # type: ignore
                pass

            infer_return_type(f)
Example #5
    def test_infer_schema_from_pandas_instances(self):
        def func() -> pd.Series[int]:
            pass

        self.assertEqual(infer_return_type(func).tpe, LongType())

        def func() -> pd.Series[np.float]:
            pass

        self.assertEqual(infer_return_type(func).tpe, DoubleType())

        def func() -> "pd.DataFrame[np.float, str]":
            pass

        expected = StructType(
            [StructField("c0", DoubleType()),
             StructField("c1", StringType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

        def func() -> "pandas.DataFrame[np.float]":
            pass

        expected = StructType([StructField("c0", DoubleType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

        def func() -> "pd.Series[int]":
            pass

        self.assertEqual(infer_return_type(func).tpe, LongType())

        def func() -> pd.DataFrame[np.float, str]:
            pass

        expected = StructType(
            [StructField("c0", DoubleType()),
             StructField("c1", StringType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

        def func() -> pd.DataFrame[np.float]:
            pass

        expected = StructType([StructField("c0", DoubleType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

        def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
            pass

        expected = StructType(
            [StructField("c0", LongType()),
             StructField("c1", LongType())])
        self.assertEqual(infer_return_type(func).tpe, expected)

        pdf = pd.DataFrame({
            "a": [1, 2, 3],
            "b": pd.Categorical(["a", "b", "c"])
        })

        def func() -> pd.Series[pdf.b.dtype]:  # type: ignore
            pass

        self.assertEqual(infer_return_type(func).tpe, LongType())

        def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
            pass

        expected = StructType(
            [StructField("c0", LongType()),
             StructField("c1", LongType())])
        self.assertEqual(infer_return_type(func).tpe, expected)
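For reference, the mapping exercised above can be checked directly outside a test case; a minimal sketch, assuming a Koalas installation and the same imports this test module uses (with `np.float64` in place of the deprecated `np.float` spelling):

    import numpy as np
    import pandas as pd
    from databricks.koalas.typedef import infer_return_type

    def func() -> pd.DataFrame[np.float64, str]:
        pass

    # Positional hints produce generated column names c0, c1, ...; per the
    # assertions above, this prints a struct of DoubleType and StringType.
    print(infer_return_type(func).tpe)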
Example #6
    def transform_batch(self, func, *args, **kwargs) -> "ks.Series":
        """
        Transform the data with a function that takes a pandas Series and outputs a pandas Series.
        The pandas Series given to the function is a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: `func` cannot access the whole input series. Koalas internally
            splits the input series into multiple batches and calls `func` with each batch multiple
            times. Therefore, operations such as global aggregations are impossible. See the example
            below.

            >>> # This case does not return the length of the whole series but of the batch
            ... # used internally.
            ... def length(pser) -> ks.Series[int]:
            ...     return pd.Series([len(pser)] * len(pser))
            ...
            >>> df = ks.DataFrame({'A': range(1000)})
            >>> df.A.koalas.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type, which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify the return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ks.Series[int]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to apply to each pandas Series batch.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        Series

        See Also
        --------
        DataFrame.koalas.apply_batch : Similar but it takes a pandas DataFrame as its internal batch.

        Examples
        --------
        >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pser) -> ks.Series[np.int64]:
        ...     return pser + 1
        >>> df.A.koalas.transform_batch(plus_one_func)
        0    2
        1    4
        2    6
        Name: A, dtype: int64

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.A.koalas.transform_batch(lambda pser: pser + 1)
        0    2
        1    4
        2    6
        Name: A, dtype: int64

        You can also specify extra arguments.

        >>> def plus_one_func(pser, a, b, c=3) -> ks.Series[np.int64]:
        ...     return pser + a + b + c
        >>> df.A.koalas.transform_batch(plus_one_func, 1, b=2)
        0     7
        1     9
        2    11
        Name: A, dtype: int64

        You can also use ``np.ufunc`` as input.

        >>> df.A.koalas.transform_batch(np.add, 10)
        0    11
        1    13
        2    15
        Name: A, dtype: int64
        """
        from databricks import koalas as ks

        assert callable(
            func), "the first argument should be a callable function."

        return_sig = None
        try:
            spec = inspect.getfullargspec(func)
            return_sig = spec.annotations.get("return", None)
        except TypeError:
            # Falls back to schema inference if it fails to get signature.
            pass

        return_schema = None
        if return_sig is not None:
            # Infer the Spark return type from the annotation.
            sig_return = infer_return_type(func)
            if not isinstance(sig_return, SeriesType):
                raise ValueError(
                    "Expected the return type of this function to be of type column,"
                    " but found type {}".format(sig_return))
            return_schema = sig_return.tpe

        ff = func
        func = lambda o: ff(o, *args, **kwargs)
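        # `_transform_batch` does the actual Spark work; with the extra arguments
        # bound above, it only ever calls `func` with a single pandas Series batch.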
        return self._transform_batch(func, return_schema)
Example #7
    def transform_batch(self, func, *args, **kwargs):
        """
        Transform chunks with a function that takes a pandas DataFrame and outputs a pandas
        DataFrame. The pandas DataFrame given to the function is a batch used internally. The
        length of each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: `func` cannot access the whole input frame. Koalas internally
            splits the input frame into multiple batches and calls `func` with each batch multiple
            times. Therefore, operations such as global aggregations are impossible. See the example
            below.

            >>> # This case does not return the length of the whole frame but of the batch
            ... # used internally.
            ... def length(pdf) -> ks.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ks.DataFrame({'A': range(1000)})
            >>> df.koalas.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type, which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify the return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ks.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas-friendly style as below:

            >>> def plus_one(x) -> ks.DataFrame['a': float, 'b': float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1


        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.koalas.apply_batch: Similar but the output length may differ from the input.
        Series.koalas.transform_batch: Similar but it takes a pandas Series as its internal batch.

        Examples
        --------
        >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ks.DataFrame[int, int]:
        ...     return pdf + 1
        >>> df.koalas.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ks.DataFrame['A': int, 'B': int]:
        ...     return pdf + 1
        >>> df.koalas.transform_batch(plus_one_func)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> def plus_one_func(pdf) -> ks.Series[int]:
        ...     return pdf.B + 1
        >>> df.koalas.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        Name: 0, dtype: int32

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.koalas.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        Note that you should not transform the index. The index information will not change.

        >>> df.koalas.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.koalas.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import first_series
        from databricks import koalas as ks

        assert callable(
            func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        names = self._kdf._internal.to_internal_spark_frame.schema.names
        should_by_pass = LooseVersion(pyspark.__version__) >= "3.0"
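        # From Spark 3.0 a scalar pandas UDF can take a struct column and receive
        # it as a pandas DataFrame (SPARK-27240), so the per-column concat/extract
        # workarounds below can be bypassed.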

        def pandas_concat(series):
            # Only from Spark 3.0 is a struct input passed to the UDF as a pandas
            # DataFrame (SPARK-27240); for older versions, concatenate the input
            # series into a frame ourselves.
            pdf = pd.concat(series, axis=1)
            pdf = pdf.rename(columns=dict(zip(pdf.columns, names)))
            return pdf

        def pandas_extract(pdf, name):
            # Extract a single column from the output frame; works around the
            # lack of struct-returning scalar UDFs before Spark 3.0. See SPARK-23836.
            return pdf[name]

        def pandas_series_func(f):
            ff = f
            return lambda *series: ff(pandas_concat(series))

        def pandas_frame_func(f):
            ff = f
            return lambda *series: pandas_extract(ff(pandas_concat(series)),
                                                  field.name)
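        # Note: `field` in `pandas_frame_func` is a free variable; it is bound when
        # each per-column UDF is created in the `for field in return_schema.fields`
        # loops below.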

        if should_infer_schema:
            # Execute the function with the first `compute.shortcut_limit` + 1
            # records to infer the return type. If there are no more records than
            # the limit, the pandas result is used directly as a shortcut.
            limit = ks.get_option("compute.shortcut_limit")
            pdf = self._kdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed))
            if len(transformed) != len(pdf):
                raise ValueError(
                    "transform_batch cannot produce aggregated results")
            kdf_or_kser = ks.from_pandas(transformed)

            if isinstance(kdf_or_kser, ks.Series):
                kser = kdf_or_kser
                pudf = pandas_udf(
                    func if should_by_pass else pandas_series_func(func),
                    returnType=kser.spark.data_type,
                    functionType=PandasUDFType.SCALAR,
                )
                columns = self._kdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._kdf._internal.copy(
                    column_labels=kser._internal.column_labels,
                    data_spark_columns=[
                        (pudf(F.struct(*columns)) if should_by_pass else pudf(
                            *columns)).alias(
                                kser._internal.data_spark_column_names[0])
                    ],
                    column_label_names=kser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                kdf = kdf_or_kser
                if len(pdf) <= limit:
                    # Only take the shortcut when the function returns a frame, to
                    # avoid operating on different dataframes in the series case.
                    return kdf

                return_schema = kdf._internal.to_internal_spark_frame.schema
                # Force nullability.
                return_schema = StructType([
                    StructField(field.name, field.dataType)
                    for field in return_schema.fields
                ])

                self_applied = DataFrame(self._kdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True)
                columns = self_applied._internal.spark_columns
                if should_by_pass:
                    pudf = pandas_udf(output_func,
                                      returnType=return_schema,
                                      functionType=PandasUDFType.SCALAR)
                    temp_struct_column = verify_temp_column_name(
                        self_applied._internal.spark_frame, "__temp_struct__")
                    applied = pudf(
                        F.struct(*columns)).alias(temp_struct_column)
                    sdf = self_applied._internal.spark_frame.select(applied)
                    sdf = sdf.selectExpr("%s.*" % temp_struct_column)
                else:
                    applied = []
                    for field in return_schema.fields:
                        applied.append(
                            pandas_udf(
                                pandas_frame_func(output_func),
                                returnType=field.dataType,
                                functionType=PandasUDFType.SCALAR,
                            )(*columns).alias(field.name))
                    sdf = self_applied._internal.spark_frame.select(*applied)
                return DataFrame(kdf._internal.with_new_sdf(sdf))
        else:
            return_type = infer_return_type(original_func)
            return_schema = return_type.tpe
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig)
            if is_return_series:
                pudf = pandas_udf(
                    func if should_by_pass else pandas_series_func(func),
                    returnType=return_schema,
                    functionType=PandasUDFType.SCALAR,
                )
                columns = self._kdf._internal.spark_columns
                internal = self._kdf._internal.copy(
                    column_labels=[(SPARK_DEFAULT_SERIES_NAME, )],
                    data_spark_columns=[
                        (pudf(F.struct(*columns)) if should_by_pass else pudf(
                            *columns)).alias(SPARK_DEFAULT_SERIES_NAME)
                    ],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                self_applied = DataFrame(self._kdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False)
                columns = self_applied._internal.spark_columns

                if should_by_pass:
                    pudf = pandas_udf(output_func,
                                      returnType=return_schema,
                                      functionType=PandasUDFType.SCALAR)
                    temp_struct_column = verify_temp_column_name(
                        self_applied._internal.spark_frame, "__temp_struct__")
                    applied = pudf(
                        F.struct(*columns)).alias(temp_struct_column)
                    sdf = self_applied._internal.spark_frame.select(applied)
                    sdf = sdf.selectExpr("%s.*" % temp_struct_column)
                else:
                    applied = []
                    for field in return_schema.fields:
                        applied.append(
                            pandas_udf(
                                pandas_frame_func(output_func),
                                returnType=field.dataType,
                                functionType=PandasUDFType.SCALAR,
                            )(*columns).alias(field.name))
                    sdf = self_applied._internal.spark_frame.select(*applied)
                return DataFrame(sdf)
Example #8
    def apply_batch(self, func, args=(), **kwds):
        """
        Apply a function that takes a pandas DataFrame and outputs a pandas DataFrame. The pandas
        DataFrame given to the function is a batch used internally.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: `func` cannot access the whole input frame. Koalas internally
            splits the input frame into multiple batches and calls `func` with each batch multiple
            times. Therefore, operations such as global aggregations are impossible. See the example
            below.

            >>> # This case does not return the length of the whole frame but of the batch
            ... # used internally.
            ... def length(pdf) -> ks.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)])
            ...
            >>> df = ks.DataFrame({'A': range(1000)})
            >>> df.koalas.apply_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...
            10  83
            11  83

        .. note:: this API executes the function once to infer the type, which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify the return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ks.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas-friendly style as below:

            >>> def plus_one(x) -> ks.DataFrame["a": float, "b": float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ks.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1


        Parameters
        ----------
        func : function
            Function to apply to each pandas frame.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keyword arguments to
            `func`.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.apply: For row/columnwise operations.
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.
        Series.koalas.transform_batch: Similar but it takes a pandas Series as its internal batch.

        Examples
        --------
        >>> df = ks.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def query_func(pdf) -> ks.DataFrame[int, int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           c0  c1
        0   1   2

        >>> def query_func(pdf) -> ks.DataFrame["A": int, "B": int]:
        ...     return pdf.query('A == 1')
        >>> df.koalas.apply_batch(query_func)
           A  B
        0  1  2

        You can also omit the type hints so Koalas infers the return schema as below:

        >>> df.koalas.apply_batch(lambda pdf: pdf.query('A == 1'))
           A  B
        0  1  2

        You can also specify extra arguments.

        >>> def calculation(pdf, y, z) -> ks.DataFrame[int, int]:
        ...     return pdf ** y + z
        >>> df.koalas.apply_batch(calculation, args=(10,), z=20)
                c0        c1
        0       21      1044
        1    59069   1048596
        2  9765645  60466196

        You can also use ``np.ufunc`` as input.

        >>> df.koalas.apply_batch(np.add, args=(10,))
            A   B
        0  11  12
        1  13  14
        2  15  16
        """
        # TODO: code here partially duplicates `DataFrame.apply`. Can we deduplicate?

        from databricks.koalas.groupby import GroupBy
        from databricks.koalas.frame import DataFrame
        from databricks import koalas as ks

        if isinstance(func, np.ufunc):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)
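        # (np.ufunc objects cannot be introspected by `inspect.getfullargspec`
        # below, so wrap them in a plain Python function first.)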

        assert callable(
            func), "the first argument should be a callable function."

        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_use_map_in_pandas = LooseVersion(pyspark.__version__) >= "3.0"
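        # `DataFrame.mapInPandas` only exists from Spark 3.0; older versions fall
        # back to a grouped-map apply keyed by `spark_partition_id()`.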

        original_func = func
        func = lambda o: original_func(o, *args, **kwds)

        self_applied = DataFrame(self._kdf._internal.resolved_copy)

        if should_infer_schema:
            # Execute the function with the first `compute.shortcut_limit` + 1
            # records to infer the return type. If there are no more records than
            # the limit, the pandas result is used directly as a shortcut.
            limit = ks.get_option("compute.shortcut_limit")
            pdf = self_applied.head(limit + 1)._to_internal_pandas()
            applied = func(pdf)
            if not isinstance(applied, pd.DataFrame):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(applied))
            kdf = ks.DataFrame(applied)
            if len(pdf) <= limit:
                return kdf

            return_schema = kdf._internal.to_internal_spark_frame.schema
            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=True)

            # If schema is inferred, we can restore indexes too.
            internal = kdf._internal.with_new_sdf(sdf)
        else:
            return_type = infer_return_type(original_func)
            return_schema = return_type.tpe
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe:
                raise TypeError(
                    "The given function should specify a frame as its type "
                    "hints; however, the return type was %s." % return_sig)

            if should_use_map_in_pandas:
                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False)
                sdf = self_applied._internal.to_internal_spark_frame.mapInPandas(
                    lambda iterator: map(output_func, iterator),
                    schema=return_schema)
            else:
                sdf = GroupBy._spark_group_map_apply(
                    self_applied,
                    func, (F.spark_partition_id(), ),
                    return_schema,
                    retain_index=False)

            # Otherwise, it loses index.
            internal = InternalFrame(spark_frame=sdf, index_map=None)

        return DataFrame(internal)
Example #9
    def test_infer_schema_with_names_pandas_instances(self):
        def func() -> 'pd.DataFrame["a" : np.float, "b":str]':  # noqa: F821
            pass

        expected = StructType(
            [StructField("a", DoubleType()),
             StructField("b", StringType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.unicode_])
        self.assertEqual(inferred.spark_type, expected)

        def func() -> "pd.DataFrame['a': np.float, 'b': int]":  # noqa: F821
            pass

        expected = StructType(
            [StructField("a", DoubleType()),
             StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.float64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType(
            [StructField("a", LongType()),
             StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType([
            StructField("(x, a)", LongType()),
            StructField("(y, b)", LongType())
        ])
        inferred = infer_return_type(func)
        self.assertEqual(inferred.dtypes, [np.int64, np.int64])
        self.assertEqual(inferred.spark_type, expected)

        pdf = pd.DataFrame({
            "a": [1, 2, 3],
            "b": pd.Categorical(["a", "b", "c"])
        })

        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
            pass

        expected = StructType(
            [StructField("a", LongType()),
             StructField("b", LongType())])
        inferred = infer_return_type(func)
        self.assertEqual(
            inferred.dtypes,
            [np.int64, CategoricalDtype(categories=["a", "b", "c"])])
        self.assertEqual(inferred.spark_type, expected)
Example #10
        def try_infer_return_type():
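            # As in Example #1: a three-element slice is not a valid name:type
            # annotation, so this should raise.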
            def f() -> 'pd.DataFrame["a" : np.float : 1, "b":str:2]':
                pass

            infer_return_type(f)