Example #1
 def aggregate_stagemetrics_DF(self, viewname="PerfStageMetrics"):
     df = self.stagemetrics.aggregateStageMetrics(viewname)
     # convert the returned Java object to a Python Dataframe
     from pyspark.sql.dataframe import DataFrame
     return DataFrame(df, self.sparksession)
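Every snippet collected on this page follows the same pattern: a py4j handle to a JVM Dataset[Row] is re-wrapped into a Python-side pyspark.sql.DataFrame together with an active SQLContext or SparkSession. As a minimal sketch of that round trip (using the internal `_jdf` attribute of an existing DataFrame as a stand-in for a Java object returned by some library call; internal attributes such as `_jdf` and `_wrapped` are not stable public API):

from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame

spark = SparkSession.builder.getOrCreate()

df = spark.range(3)   # an ordinary Python-side DataFrame
jdf = df._jdf         # the underlying JVM Dataset[Row] (a py4j JavaObject)

# Re-wrap the JVM object into a Python DataFrame. On Spark 3.3+ the session can
# be passed directly; older versions expect the wrapped SQLContext
# (spark._wrapped), which is what several of the snippets below pass.
wrapped = DataFrame(jdf, spark)
wrapped.show()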
Example #2
    def applyInPandas(self, func: "PandasGroupedMapFunction",
                      schema: Union[StructType, str]) -> DataFrame:
        """
        Maps each group of the current :class:`DataFrame` using a pandas udf and returns the result
        as a `DataFrame`.

        The function should take a `pandas.DataFrame` and return another
        `pandas.DataFrame`. For each group, all columns are passed together as a `pandas.DataFrame`
        to the user-function and the returned `pandas.DataFrame`\\s are combined as a
        :class:`DataFrame`.

        The `schema` should be a :class:`StructType` describing the schema of the returned
        `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match
        the field names in the defined schema if specified as strings, or match the
        field data types by position if not strings, e.g. integer indices.
        The length of the returned `pandas.DataFrame` can be arbitrary.

        .. versionadded:: 3.0.0

        Parameters
        ----------
        func : function
            a Python native function that takes a `pandas.DataFrame`, and outputs a
            `pandas.DataFrame`.
        schema : :class:`pyspark.sql.types.DataType` or str
            the return type of the `func` in PySpark. The value can be either a
            :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        Examples
        --------
        >>> import pandas as pd  # doctest: +SKIP
        >>> from pyspark.sql.functions import pandas_udf, ceil
        >>> df = spark.createDataFrame(
        ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
        ...     ("id", "v"))  # doctest: +SKIP
        >>> def normalize(pdf):
        ...     v = pdf.v
        ...     return pdf.assign(v=(v - v.mean()) / v.std())
        >>> df.groupby("id").applyInPandas(
        ...     normalize, schema="id long, v double").show()  # doctest: +SKIP
        +---+-------------------+
        | id|                  v|
        +---+-------------------+
        |  1|-0.7071067811865475|
        |  1| 0.7071067811865475|
        |  2|-0.8320502943378437|
        |  2|-0.2773500981126146|
        |  2| 1.1094003924504583|
        +---+-------------------+

        Alternatively, the user can pass a function that takes two arguments.
        In this case, the grouping key(s) will be passed as the first argument and the data will
        be passed as the second argument. The grouping key(s) will be passed as a tuple of numpy
        data types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in
        as a `pandas.DataFrame` containing all columns from the original Spark DataFrame.
        This is useful when the user does not want to hardcode grouping key(s) in the function.

        >>> df = spark.createDataFrame(
        ...     [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
        ...     ("id", "v"))  # doctest: +SKIP
        >>> def mean_func(key, pdf):
        ...     # key is a tuple of one numpy.int64, which is the value
        ...     # of 'id' for the current group
        ...     return pd.DataFrame([key + (pdf.v.mean(),)])
        >>> df.groupby('id').applyInPandas(
        ...     mean_func, schema="id long, v double").show()  # doctest: +SKIP
        +---+---+
        | id|  v|
        +---+---+
        |  1|1.5|
        |  2|6.0|
        +---+---+

        >>> def sum_func(key, pdf):
        ...     # key is a tuple of two numpy.int64s, which is the values
        ...     # of 'id' and 'ceil(df.v / 2)' for the current group
        ...     return pd.DataFrame([key + (pdf.v.sum(),)])
        >>> df.groupby(df.id, ceil(df.v / 2)).applyInPandas(
        ...     sum_func, schema="id long, `ceil(v / 2)` long, v double").show()  # doctest: +SKIP
        +---+-----------+----+
        | id|ceil(v / 2)|   v|
        +---+-----------+----+
        |  2|          5|10.0|
        |  1|          1| 3.0|
        |  2|          3| 5.0|
        |  2|          2| 3.0|
        +---+-----------+----+

        Notes
        -----
        This function requires a full shuffle. All the data of a group will be loaded
        into memory, so the user should be aware of the potential OOM risk if data is skewed
        and certain groups are too large to fit in memory.

        This API is experimental.

        See Also
        --------
        pyspark.sql.functions.pandas_udf
        """
        from pyspark.sql import GroupedData
        from pyspark.sql.functions import pandas_udf, PandasUDFType

        assert isinstance(self, GroupedData)

        udf = pandas_udf(func,
                         returnType=schema,
                         functionType=PandasUDFType.GROUPED_MAP)
        df = self._df
        udf_column = udf(*[df[col] for col in df.columns])
        jdf = self._jgd.flatMapGroupsInPandas(udf_column._jc.expr())
        return DataFrame(jdf, self.session)
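The contract described in the docstring can be checked with plain pandas, independent of Spark: each group is handed to the function as a complete pandas.DataFrame and the returned frames are concatenated. A driver-side, pandas-only sketch of the same normalize logic used in the doctest above:

import pandas as pd

pdf = pd.DataFrame({"id": [1, 1, 2, 2, 2], "v": [1.0, 2.0, 3.0, 5.0, 10.0]})

def normalize(g):
    v = g.v
    return g.assign(v=(v - v.mean()) / v.std())

# Emulate the grouped-map semantics locally: apply per group, then concatenate.
out = pd.concat(normalize(g) for _, g in pdf.groupby("id"))
print(out)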
Example #3
    def applyInPandas(self, func: "PandasCogroupedMapFunction",
                      schema: Union[StructType, str]) -> DataFrame:
        """
        Applies a function to each cogroup using pandas and returns the result
        as a `DataFrame`.

        The function should take two `pandas.DataFrame`\\s and return another
        `pandas.DataFrame`.  For each side of the cogroup, all columns are passed together as a
        `pandas.DataFrame` to the user-function and the returned `pandas.DataFrame`\\s are combined as
        a :class:`DataFrame`.

        The `schema` should be a :class:`StructType` describing the schema of the returned
        `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match
        the field names in the defined schema if specified as strings, or match the
        field data types by position if not strings, e.g. integer indices.
        The length of the returned `pandas.DataFrame` can be arbitrary.

        .. versionadded:: 3.0.0

        Parameters
        ----------
        func : function
            a Python native function that takes two `pandas.DataFrame`\\s, and
            outputs a `pandas.DataFrame`, or that takes one tuple (grouping keys) and two
            pandas ``DataFrame``\\s, and outputs a pandas ``DataFrame``.
        schema : :class:`pyspark.sql.types.DataType` or str
            the return type of the `func` in PySpark. The value can be either a
            :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        Examples
        --------
        >>> import pandas as pd  # doctest: +SKIP
        >>> from pyspark.sql.functions import pandas_udf
        >>> df1 = spark.createDataFrame(
        ...     [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
        ...     ("time", "id", "v1"))
        >>> df2 = spark.createDataFrame(
        ...     [(20000101, 1, "x"), (20000101, 2, "y")],
        ...     ("time", "id", "v2"))
        >>> def asof_join(l, r):
        ...     return pd.merge_asof(l, r, on="time", by="id")
        >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
        ...     asof_join, schema="time int, id int, v1 double, v2 string"
        ... ).show()  # doctest: +SKIP
        +--------+---+---+---+
        |    time| id| v1| v2|
        +--------+---+---+---+
        |20000101|  1|1.0|  x|
        |20000102|  1|3.0|  x|
        |20000101|  2|2.0|  y|
        |20000102|  2|4.0|  y|
        +--------+---+---+---+

        Alternatively, the user can define a function that takes three arguments.  In this case,
        the grouping key(s) will be passed as the first argument and the data will be passed as the
        second and third arguments.  The grouping key(s) will be passed as a tuple of numpy data
        types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in as two
        `pandas.DataFrame`\\s containing all columns from the original Spark DataFrames.

        >>> def asof_join(k, l, r):
        ...     if k == (1,):
        ...         return pd.merge_asof(l, r, on="time", by="id")
        ...     else:
        ...         return pd.DataFrame(columns=['time', 'id', 'v1', 'v2'])
        >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
        ...     asof_join, "time int, id int, v1 double, v2 string").show()  # doctest: +SKIP
        +--------+---+---+---+
        |    time| id| v1| v2|
        +--------+---+---+---+
        |20000101|  1|1.0|  x|
        |20000102|  1|3.0|  x|
        +--------+---+---+---+

        Notes
        -----
        This function requires a full shuffle. All the data of a cogroup will be loaded
        into memory, so the user should be aware of the potential OOM risk if data is skewed
        and certain groups are too large to fit in memory.

        This API is experimental.

        See Also
        --------
        pyspark.sql.functions.pandas_udf
        """
        from pyspark.sql.pandas.functions import pandas_udf

        # The usage of the pandas_udf is internal so type checking is disabled.
        udf = pandas_udf(
            func,
            returnType=schema,
            functionType=PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF
        )  # type: ignore[call-overload]

        all_cols = self._extract_cols(self._gd1) + self._extract_cols(
            self._gd2)
        udf_column = udf(*all_cols)
        jdf = self._gd1._jgd.flatMapCoGroupsInPandas(self._gd2._jgd,
                                                     udf_column._jc.expr())
        return DataFrame(jdf, self._gd1.session)
Example #4
 def _df(self, jdf):
     from pyspark.sql.dataframe import DataFrame
     return DataFrame(jdf, self._spark)
Example #5
 def get_grid_models_metrics(self):
     return DataFrame(self._java_obj.getGridModelsMetrics(),
                      self._hc._sql_context)
Example #6
    def createDataFrame(self, data, schema=None, samplingRatio=None):
        """
        Creates a :class:`DataFrame` from an :class:`RDD` of :class:`tuple`/:class:`list`,
        list or :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of :class:`Row`/:class:`tuple`/:class:`list`/:class:`dict`,
            :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`StructType` or list of column names. default None.
        :param samplingRatio: the sample ratio of rows used for inferring
        :return: :class:`DataFrame`

        >>> l = [('Alice', 1)]
        >>> sqlContext.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> sqlContext.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> sqlContext.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> sqlContext.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = sqlContext.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = sqlContext.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = sqlContext.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> sqlContext.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if has_pandas and isinstance(data, pandas.DataFrame):
            if schema is None:
                schema = list(data.columns)
            data = [r.tolist() for r in data.to_records(index=False)]

        if not isinstance(data, RDD):
            if not isinstance(data, list):
                data = list(data)
            try:
                # data could be list, tuple, generator ...
                rdd = self._sc.parallelize(data)
            except Exception:
                raise TypeError("cannot create an RDD from type: %s" %
                                type(data))
        else:
            rdd = data

        if schema is None or isinstance(schema, (list, tuple)):
            if isinstance(data, RDD):
                struct = self._inferSchema(rdd, samplingRatio)
            else:
                struct = self._inferSchemaFromList(data)
            if isinstance(schema, (list, tuple)):
                for i, name in enumerate(schema):
                    struct.fields[i].name = name
            schema = struct
            converter = _create_converter(schema)
            rdd = rdd.map(converter)

        elif isinstance(schema, StructType):
            # take the first few rows to verify schema
            rows = rdd.take(10)
            for row in rows:
                _verify_type(row, schema)

        else:
            raise TypeError("schema should be StructType or list or None")

        # convert python objects to sql data
        converter = _python_to_sql_converter(schema)
        rdd = rdd.map(converter)

        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        df = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        return DataFrame(df, self)
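A side note on samplingRatio, since the parameter description above is terse: inferring the schema from only the first row fails when that row contains a None, whereas sampling lets the per-row types be merged. A hedged sketch, assuming the same `sc` and `sqlContext` globals that the doctests above use:

rows = [{"name": "Alice", "age": None}, {"name": "Bob", "age": 5}]
rdd = sc.parallelize(rows)

# With samplingRatio=None only the first row is inspected and the type of `age`
# cannot be determined; sampling merges NullType with the LongType seen later.
df = sqlContext.createDataFrame(rdd, samplingRatio=1.0)
df.printSchema()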
Example #7
    def createDataFrame(self,
                        data,
                        schema=None,
                        samplingRatio=None,
                        verifySchema=True):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
        the real data, or an exception will be thrown at runtime. If the given schema is not
        :class:`pyspark.sql.types.StructType`, it will be wrapped into a
        :class:`pyspark.sql.types.StructType` as its only field, and the field name will be "value",
        each record will also be wrapped into a tuple, which can be converted to row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is ``None``.  The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`. We can also use
            ``int`` as a short name for ``IntegerType``.
        :param samplingRatio: the sample ratio of rows used for inferring
        :param verifySchema: verify data types of every row against schema.
        :return: :class:`DataFrame`

        .. versionchanged:: 2.1
           Added verifySchema.

        .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)
        elif isinstance(schema, (list, tuple)):
            # Must re-encode any unicode strings to be consistent with StructField names
            schema = [
                x.encode('utf-8') if not isinstance(x, str) else x
                for x in schema
            ]

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            from pyspark.sql.utils import require_minimum_pandas_version
            require_minimum_pandas_version()

            if self._wrapped._conf.pandasRespectSessionTimeZone():
                timezone = self._wrapped._conf.sessionLocalTimeZone()
            else:
                timezone = None

            # If no schema supplied by user then get the names of columns only
            if schema is None:
                schema = [
                    str(x) if not isinstance(x, basestring) else
                    (x.encode('utf-8') if not isinstance(x, str) else x)
                    for x in data.columns
                ]

            if self._wrapped._conf.arrowEnabled() and len(data) > 0:
                try:
                    return self._create_from_pandas_with_arrow(
                        data, schema, timezone)
                except Exception as e:
                    from pyspark.util import _exception_message

                    if self._wrapped._conf.arrowFallbackEnabled():
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true; however, "
                            "failed by the reason below:\n  %s\n"
                            "Attempting non-optimization as "
                            "'spark.sql.execution.arrow.fallback.enabled' is set to "
                            "true." % _exception_message(e))
                        warnings.warn(msg)
                    else:
                        msg = (
                            "createDataFrame attempted Arrow optimization because "
                            "'spark.sql.execution.arrow.enabled' is set to true, but has reached "
                            "the error below and will not continue because automatic fallback "
                            "with 'spark.sql.execution.arrow.fallback.enabled' has been set to "
                            "false.\n  %s" % _exception_message(e))
                        warnings.warn(msg)
                        raise
            data = self._convert_from_pandas(data, schema, timezone)

        if isinstance(schema, StructType):
            verify_func = _make_type_verifier(
                schema) if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj
        elif isinstance(schema, DataType):
            dataType = schema
            schema = StructType().add("value", schema)

            verify_func = _make_type_verifier(
                dataType,
                name="field value") if verifySchema else lambda _: True

            def prepare(obj):
                verify_func(obj)
                return obj,
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema,
                                              samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(
            jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
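The Arrow branch above is only taken when the corresponding configuration keys are enabled; the key names appear verbatim in the warning messages. A short usage sketch, assuming a Spark 2.3/2.4-era session (later releases renamed these keys under spark.sql.execution.arrow.pyspark.*):

from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.getOrCreate()
# Opt in to the Arrow fast path and allow falling back to the plain converter.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")

pdf = pd.DataFrame({"name": ["Alice"], "age": [1]})
# With the flags above this call goes through _create_from_pandas_with_arrow;
# otherwise it falls back to the row-by-row conversion path.
spark.createDataFrame(pdf).show()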
Example #8
 def getLeaderboard(self, *extraColumns):
     if len(extraColumns) == 1 and isinstance(extraColumns[0], list):
         extraColumns = extraColumns[0]
     leaderboard_java = self._java_obj.getLeaderboard(extraColumns)
     return DataFrame(leaderboard_java,
                      SparkSession.builder.getOrCreate()._wrapped)
Example #9
File: group_ops.py Project: zzl0/spark
    def applyInPandas(self, func, schema):
        """
        Applies a function to each cogroup using pandas and returns the result
        as a `DataFrame`.

        The function should take two `pandas.DataFrame`\\s and return another
        `pandas.DataFrame`.  For each side of the cogroup, all columns are passed together as a
        `pandas.DataFrame` to the user-function and the returned `pandas.DataFrame`\\s are combined as
        a :class:`DataFrame`.

        The `schema` should be a :class:`StructType` describing the schema of the returned
        `pandas.DataFrame`. The column labels of the returned `pandas.DataFrame` must either match
        the field names in the defined schema if specified as strings, or match the
        field data types by position if not strings, e.g. integer indices.
        The length of the returned `pandas.DataFrame` can be arbitrary.

        :param func: a Python native function that takes two `pandas.DataFrame`\\s, and
            outputs a `pandas.DataFrame`, or that takes one tuple (grouping keys) and two
            pandas ``DataFrame``s, and outputs a pandas ``DataFrame``.
        :param schema: the return type of the `func` in PySpark. The value can be either a
            :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        >>> import pandas as pd  # doctest: +SKIP
        >>> from pyspark.sql.functions import pandas_udf
        >>> df1 = spark.createDataFrame(
        ...     [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
        ...     ("time", "id", "v1"))
        >>> df2 = spark.createDataFrame(
        ...     [(20000101, 1, "x"), (20000101, 2, "y")],
        ...     ("time", "id", "v2"))
        >>> def asof_join(l, r):
        ...     return pd.merge_asof(l, r, on="time", by="id")
        >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
        ...     asof_join, schema="time int, id int, v1 double, v2 string"
        ... ).show()  # doctest: +SKIP
        +--------+---+---+---+
        |    time| id| v1| v2|
        +--------+---+---+---+
        |20000101|  1|1.0|  x|
        |20000102|  1|3.0|  x|
        |20000101|  2|2.0|  y|
        |20000102|  2|4.0|  y|
        +--------+---+---+---+

        Alternatively, the user can define a function that takes three arguments.  In this case,
        the grouping key(s) will be passed as the first argument and the data will be passed as the
        second and third arguments.  The grouping key(s) will be passed as a tuple of numpy data
        types, e.g., `numpy.int32` and `numpy.float64`. The data will still be passed in as two
        `pandas.DataFrame`\\s containing all columns from the original Spark DataFrames.

        >>> def asof_join(k, l, r):
        ...     if k == (1,):
        ...         return pd.merge_asof(l, r, on="time", by="id")
        ...     else:
        ...         return pd.DataFrame(columns=['time', 'id', 'v1', 'v2'])
        >>> df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
        ...     asof_join, "time int, id int, v1 double, v2 string").show()  # doctest: +SKIP
        +--------+---+---+---+
        |    time| id| v1| v2|
        +--------+---+---+---+
        |20000101|  1|1.0|  x|
        |20000102|  1|3.0|  x|
        +--------+---+---+---+

        .. note:: This function requires a full shuffle. All the data of a cogroup will be loaded
            into memory, so the user should be aware of the potential OOM risk if data is skewed
            and certain groups are too large to fit in memory.

        .. note:: If returning a new `pandas.DataFrame` constructed with a dictionary, it is
            recommended to explicitly index the columns by name to ensure the positions are correct,
            or alternatively use an `OrderedDict`.
            For example, `pd.DataFrame({'id': ids, 'a': data}, columns=['id', 'a'])` or
            `pd.DataFrame(OrderedDict([('id', ids), ('a', data)]))`.

        .. note:: Experimental

        .. seealso:: :meth:`pyspark.sql.functions.pandas_udf`

        """
        from pyspark.sql.pandas.functions import pandas_udf

        udf = pandas_udf(
            func,
            returnType=schema,
            functionType=PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF)
        all_cols = self._extract_cols(self._gd1) + self._extract_cols(
            self._gd2)
        udf_column = udf(*all_cols)
        jdf = self._gd1._jgd.flatMapCoGroupsInPandas(self._gd2._jgd,
                                                     udf_column._jc.expr())
        return DataFrame(jdf, self.sql_ctx)
Example #10
File: session.py Project: yaooqinn/spark
    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
        """Returns a :class:`DataFrame` representing the result of the given query.
        When ``kwargs`` is specified, this method formats the given string by using the Python
        standard formatter.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        sqlQuery : str
            SQL query string.
        kwargs : dict
            Other variables that the user wants to set that can be referenced in the query

            .. versionchanged:: 3.3.0
               Added optional argument ``kwargs`` to specify the mapping of variables in the query.
               This feature is experimental and unstable.

        Returns
        -------
        :class:`DataFrame`

        Examples
        --------
        Executing a SQL query.

        >>> spark.sql("SELECT * FROM range(10) where id > 7").show()
        +---+
        | id|
        +---+
        |  8|
        |  9|
        +---+

        Executing a SQL query with variables as Python formatter standard.

        >>> spark.sql(
        ...     "SELECT * FROM range(10) WHERE id > {bound1} AND id < {bound2}", bound1=7, bound2=9
        ... ).show()
        +---+
        | id|
        +---+
        |  8|
        +---+

        >>> mydf = spark.range(10)
        >>> spark.sql(
        ...     "SELECT {col} FROM {mydf} WHERE id IN {x}",
        ...     col=mydf.id, mydf=mydf, x=tuple(range(4))).show()
        +---+
        | id|
        +---+
        |  0|
        |  1|
        |  2|
        |  3|
        +---+

        >>> spark.sql('''
        ...   SELECT m1.a, m2.b
        ...   FROM {table1} m1 INNER JOIN {table2} m2
        ...   ON m1.key = m2.key
        ...   ORDER BY m1.a, m2.b''',
        ...   table1=spark.createDataFrame([(1, "a"), (2, "b")], ["a", "key"]),
        ...   table2=spark.createDataFrame([(3, "a"), (4, "b"), (5, "b")], ["b", "key"])).show()
        +---+---+
        |  a|  b|
        +---+---+
        |  1|  3|
        |  2|  4|
        |  2|  5|
        +---+---+

        Also, it is possible to query using :class:`Column` from :class:`DataFrame`.

        >>> mydf = spark.createDataFrame([(1, 4), (2, 4), (3, 6)], ["A", "B"])
        >>> spark.sql("SELECT {df.A}, {df[B]} FROM {df}", df=mydf).show()
        +---+---+
        |  A|  B|
        +---+---+
        |  1|  4|
        |  2|  4|
        |  3|  6|
        +---+---+
        """

        formatter = SQLStringFormatter(self)
        if len(kwargs) > 0:
            sqlQuery = formatter.format(sqlQuery, **kwargs)
        try:
            return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
        finally:
            if len(kwargs) > 0:
                formatter.clear()
Example #11
 def getGridModelsMetrics(self):
     jdf = self._java_obj.getGridModelsMetrics()
     sqlContext = SparkSession.builder.getOrCreate()._wrapped
     return DataFrame(jdf, sqlContext)
Example #12
File: group.py Project: zero323/spark
    def agg(self, *exprs: Union[Column, Dict[str, str]]) -> DataFrame:
        """Compute aggregates and returns the result as a :class:`DataFrame`.

        The available aggregate functions can be:

        1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`

        2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`

           .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
               a full shuffle is required. Also, all the data of a group will be loaded into
               memory, so the user should be aware of the potential OOM risk if data is skewed
               and certain groups are too large to fit in memory.

           .. seealso:: :func:`pyspark.sql.functions.pandas_udf`

        If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
        is the column to perform aggregation on, and the value is the aggregate function.

        Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.

        .. versionadded:: 1.3.0

        Parameters
        ----------
        exprs : dict
            a dict mapping from column name (string) to aggregate functions (string),
            or a list of :class:`Column`.

        Notes
        -----
        Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed
        in a single call to this function.

        Examples
        --------
        >>> gdf = df.groupBy(df.name)
        >>> sorted(gdf.agg({"*": "count"}).collect())
        [Row(name='Alice', count(1)=1), Row(name='Bob', count(1)=1)]

        >>> from pyspark.sql import functions as F
        >>> sorted(gdf.agg(F.min(df.age)).collect())
        [Row(name='Alice', min(age)=2), Row(name='Bob', min(age)=5)]

        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
        >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
        ... def min_udf(v):
        ...     return v.min()
        >>> sorted(gdf.agg(min_udf(df.age)).collect())  # doctest: +SKIP
        [Row(name='Alice', min_udf(age)=2), Row(name='Bob', min_udf(age)=5)]
        """
        assert exprs, "exprs should not be empty"
        if len(exprs) == 1 and isinstance(exprs[0], dict):
            jdf = self._jgd.agg(exprs[0])
        else:
            # Columns
            assert all(isinstance(c, Column)
                       for c in exprs), "all exprs should be Column"
            exprs = cast(Tuple[Column, ...], exprs)
            jdf = self._jgd.agg(
                exprs[0]._jc,
                _to_seq(self.session._sc, [c._jc for c in exprs[1:]]))
        return DataFrame(jdf, self.session)
Example #13
File: group.py Project: zero323/spark
 def _api(self: "GroupedData", *cols: str) -> DataFrame:
     name = f.__name__
     jdf = getattr(self._jgd, name)(_to_seq(self.session._sc, cols))
     return DataFrame(jdf, self.session)
Example #14
File: group.py Project: zero323/spark
 def _api(self: "GroupedData") -> DataFrame:
     name = f.__name__
     jdf = getattr(self._jgd, name)()
     return DataFrame(jdf, self.session)
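Examples #13 and #14 are the inner functions of small decorator factories in pyspark/sql/group.py: the decorated Python method is only a name (and docstring) carrier, and the call is dispatched to the same-named JVM method on the wrapped RelationalGroupedDataset. A sketch of how such a factory fits together (the name follows the dfapi helper in the Spark source, but treat the exact wiring here as an illustration rather than a verbatim excerpt):

from pyspark.sql.dataframe import DataFrame

def dfapi(f):
    # Replace the decorated no-op method with a dispatcher that invokes the
    # JVM method of the same name on self._jgd (a RelationalGroupedDataset).
    def _api(self):
        name = f.__name__
        jdf = getattr(self._jgd, name)()
        return DataFrame(jdf, self.session)
    _api.__name__ = f.__name__
    _api.__doc__ = f.__doc__
    return _api

# Stub class for illustration only; the real GroupedData lives in pyspark.sql.group.
class GroupedData:
    @dfapi
    def count(self):
        """Counts the number of records for each group."""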
Example #15
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.sql import SparkSession
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, SparkSession)

        from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
        from pyspark.sql.types import TimestampType
        from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
        from pyspark.sql.pandas.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        import pyarrow as pa

        # Create the Spark schema from list of names passed in with Arrow types
        if isinstance(schema, (list, tuple)):
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
            struct = StructType()
            for name, field in zip(schema, arrow_schema):
                struct.add(name,
                           from_arrow_type(field.type),
                           nullable=field.nullable)
            schema = struct

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError(
                "Single data type %s is not supported with Arrow" %
                str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [
                to_arrow_type(TimestampType())
                if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                for t in pdf.dtypes
            ]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf.iloc[start:start + step]
                      for start in range(0, len(pdf), step))

        # Create list of Arrow (columns, type) for serializer dump_stream
        arrow_data = [[(c, t)
                       for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)
                       ] for pdf_slice in pdf_slices]

        jsqlContext = self._wrapped._jsqlContext

        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        col_by_name = True  # col by name only applies to StructType columns, can't happen here
        ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(
                jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func,
                                          create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(),
                                                   jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
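One detail worth calling out from the slicing step above: `-(-a // b)` is plain-Python ceiling division (the "round int up" comment), used to pick a slice size that spreads the pandas rows over roughly defaultParallelism partitions. A standalone illustration:

import math

rows, parallelism = 10, 4
step = -(-rows // parallelism)               # == 3, same as math.ceil(rows / parallelism)
assert step == math.ceil(rows / parallelism)

slices = [list(range(start, min(start + step, rows)))
          for start in range(0, rows, step)]
print(slices)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]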
Example #16
 def create_taskmetrics_DF(self, viewname="PerfTaskMetrics"):
     df = self.taskmetrics.createTaskMetricsDF(viewname)
     # convert the returned Java object to a Python Dataframe
     from pyspark.sql.dataframe import DataFrame
     return DataFrame(df, self.sparksession)
Example #17
 def _transform_recursive(self, dataset, recursive_pipeline):
     self._transfer_params_to_java()
     return DataFrame(self._java_obj.recursiveTransform(dataset._jdf, recursive_pipeline._to_java()), dataset.sql_ctx)
Example #18
 def leaderboard(self):
     leaderboard_java = self._java_obj.leaderboard()
     if leaderboard_java.isDefined():
         return DataFrame(leaderboard_java.get(), self._hc._sql_context)
     else:
         return None
Example #19
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.serializers import ArrowStreamSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError(
                "Single data type %s is not supported with Arrow" %
                str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [
                to_arrow_type(TimestampType())
                if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                for t in pdf.dtypes
            ]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf[start:start + step]
                      for start in xrange(0, len(pdf), step))

        # Create Arrow record batches
        batches = [
            _create_batch(
                [(c, t)
                 for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                timezone) for pdf_slice in pdf_slices
        ]

        # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        jsqlContext = self._wrapped._jsqlContext

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(
                jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(),
                                          reader_func, create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(),
                                                   jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
Example #20
File: group.py Project: stels07/spark
 def _api(self):
     name = f.__name__
     jdf = getattr(self._jgd, name)()
     return DataFrame(jdf, self.sql_ctx)
Example #21
    def createDataFrame(self, data, schema=None, samplingRatio=None):
        """
        Creates a :class:`DataFrame` from an :class:`RDD` of :class:`tuple`/:class:`list`,
        list or :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of :class:`Row`/:class:`tuple`/:class:`list`/:class:`dict`,
            :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`StructType` or list of column names. default None.
        :param samplingRatio: the sample ratio of rows used for inferring
        :return: :class:`DataFrame`

        >>> l = [('Alice', 1)]
        >>> sqlContext.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> sqlContext.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> sqlContext.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> sqlContext.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = sqlContext.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = sqlContext.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = sqlContext.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> sqlContext.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data, schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(data, schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._ssql_ctx.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self)
        df._schema = schema
        return df
Example #22
File: group.py Project: stels07/spark
 def _api(self, *cols):
     name = f.__name__
     jdf = getattr(self._jgd, name)(_to_seq(self.sql_ctx._sc, cols))
     return DataFrame(jdf, self.sql_ctx)
Example #23
    def createDataFrame(self, data, schema=None, samplingRatio=None):
        """
        Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

        When ``schema`` is a list of column names, the type of each column
        will be inferred from ``data``.

        When ``schema`` is ``None``, it will try to infer the schema (column names and types)
        from ``data``, which should be an RDD of :class:`Row`,
        or :class:`namedtuple`, or :class:`dict`.

        When ``schema`` is :class:`DataType` or datatype string, it must match the real data, or
        exception will be thrown at runtime. If the given schema is not StructType, it will be
        wrapped into a StructType as its only field, and the field name will be "value", each record
        will also be wrapped into a tuple, which can be converted to row later.

        If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
        rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

        :param data: an RDD of any kind of SQL data representation(e.g. row, tuple, int, boolean,
            etc.), or :class:`list`, or :class:`pandas.DataFrame`.
        :param schema: a :class:`DataType` or a datatype string or a list of column names, default
            is None.  The data type string format equals to `DataType.simpleString`, except that
            top level struct type can omit the `struct<>` and atomic types use `typeName()` as
            their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int`
            as a short name for IntegerType.
        :param samplingRatio: the sample ratio of rows used for inferring
        :return: :class:`DataFrame`

        .. versionchanged:: 2.0
           The schema parameter can be a DataType or a datatype string after 2.0. If it's not a
           StructType, it will be wrapped into a StructType and each record will also be wrapped
           into a tuple.

        >>> l = [('Alice', 1)]
        >>> spark.createDataFrame(l).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> spark.createDataFrame(l, ['name', 'age']).collect()
        [Row(name=u'Alice', age=1)]

        >>> d = [{'name': 'Alice', 'age': 1}]
        >>> spark.createDataFrame(d).collect()
        [Row(age=1, name=u'Alice')]

        >>> rdd = sc.parallelize(l)
        >>> spark.createDataFrame(rdd).collect()
        [Row(_1=u'Alice', _2=1)]
        >>> df = spark.createDataFrame(rdd, ['name', 'age'])
        >>> df.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql import Row
        >>> Person = Row('name', 'age')
        >>> person = rdd.map(lambda r: Person(*r))
        >>> df2 = spark.createDataFrame(person)
        >>> df2.collect()
        [Row(name=u'Alice', age=1)]

        >>> from pyspark.sql.types import *
        >>> schema = StructType([
        ...    StructField("name", StringType(), True),
        ...    StructField("age", IntegerType(), True)])
        >>> df3 = spark.createDataFrame(rdd, schema)
        >>> df3.collect()
        [Row(name=u'Alice', age=1)]

        >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
        [Row(name=u'Alice', age=1)]
        >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
        [Row(0=1, 1=2)]

        >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
        [Row(a=u'Alice', b=1)]
        >>> rdd = rdd.map(lambda row: row[1])
        >>> spark.createDataFrame(rdd, "int").collect()
        [Row(value=1)]
        >>> spark.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        Py4JJavaError: ...
        """
        if isinstance(data, DataFrame):
            raise TypeError("data is already a DataFrame")

        if isinstance(schema, basestring):
            schema = _parse_datatype_string(schema)

        try:
            import pandas
            has_pandas = True
        except Exception:
            has_pandas = False
        if has_pandas and isinstance(data, pandas.DataFrame):
            if schema is None:
                schema = [str(x) for x in data.columns]
            data = [r.tolist() for r in data.to_records(index=False)]

        if isinstance(schema, StructType):
            def prepare(obj):
                _verify_type(obj, schema)
                return obj
        elif isinstance(schema, DataType):
            datatype = schema

            def prepare(obj):
                _verify_type(obj, datatype)
                return (obj, )
            schema = StructType().add("value", datatype)
        else:
            prepare = lambda obj: obj

        if isinstance(data, RDD):
            rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
        else:
            rdd, schema = self._createFromLocal(map(prepare, data), schema)
        jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
        jdf = self._jsparkSession.applySchemaToPythonRDD(jrdd.rdd(), schema.json())
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
Example #24
 def get_grid_models_params(self):
     return DataFrame(self._java_obj.getGridModelsParams(),
                      self._hc._sql_context)