def _test() -> None:
    import sys
    import doctest
    import os
    from pyspark.sql import SparkSession
    import pyspark.sql.streaming.query
    from py4j.protocol import Py4JError

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.streaming.query.__dict__.copy()
    try:
        spark = SparkSession._getActiveSessionOrCreate()
    except Py4JError:  # noqa: F821
        spark = SparkSession(sc)  # type: ignore[name-defined] # noqa: F821
    globs["spark"] = spark

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.streaming.query,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    globs["spark"].stop()

    if failure_count:
        sys.exit(-1)
def _test() -> None:
    import sys
    import doctest
    import os
    from pyspark.sql import SparkSession
    import pyspark.sql.streaming.listener
    from py4j.protocol import Py4JError

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.streaming.listener.__dict__.copy()
    try:
        spark = SparkSession._getActiveSessionOrCreate()
    except Py4JError:  # noqa: F821
        spark = SparkSession(sc)  # type: ignore[name-defined] # noqa: F821
    globs["spark"] = spark

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.streaming.listener,
        globs=globs,
    )
    globs["spark"].stop()

    if failure_count:
        sys.exit(-1)
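# Both doctest runners above follow the usual PySpark convention of being
# wired to a module-level entry point; a minimal sketch, assuming the
# standard layout, so `python -m <module>` runs the module's doctests:
if __name__ == "__main__":
    _test()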
def _java2py(sc: SparkContext, r: "JavaObjectOrPickleDump", encoding: str = "bytes") -> Any:
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != "JavaRDD" and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = "JavaRDD"

        assert sc._jvm is not None

        if clsName == "JavaRDD":
            jrdd = sc._jvm.org.apache.spark.mllib.api.python.SerDe.javaToPython(r)
            return RDD(jrdd, sc)

        if clsName == "Dataset":
            return DataFrame(r, SparkSession._getActiveSessionOrCreate())

        if clsName in _picklable_classes:
            r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r)
        elif isinstance(r, (JavaArray, JavaList)):
            try:
                r = sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(r)
            except Py4JJavaError:
                pass  # not picklable

    if isinstance(r, (bytearray, bytes)):
        r = CPickleSerializer().loads(bytes(r), encoding=encoding)
    return r
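# Hypothetical round-trip sketch for _java2py, assuming a live SparkContext
# and its pickling counterpart _py2java from pyspark.mllib.common: a Python
# value is pickled into a JVM object, then converted back on the Python side.
from pyspark import SparkContext
from pyspark.mllib.common import _py2java, _java2py

sc = SparkContext.getOrCreate()
jobj = _py2java(sc, [1.0, 2.0, 3.0])  # Python list -> JVM object via pickle
result = _java2py(sc, jobj)           # JVM object -> Python value
print(result)  # expected to print [1.0, 2.0, 3.0]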
def sparkSession(self):
    """
    Returns the user-specified Spark Session or the default.
    """
    if self._sparkSession is None:
        self._sparkSession = SparkSession._getActiveSessionOrCreate()
    return self._sparkSession
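# _getActiveSessionOrCreate is private; a minimal sketch of the equivalent
# fallback using only public API (SparkSession.getActiveSession, added in 3.0):
from pyspark.sql import SparkSession

spark = SparkSession.getActiveSession()
if spark is None:
    spark = SparkSession.builder.getOrCreate()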
def cast(self, dataType: Union[DataType, str]) -> "Column":
    """
    Casts the column into type ``dataType``.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    dataType : :class:`DataType` or str
        a :class:`DataType` or a DDL-formatted type string to cast the
        column to.

    Returns
    -------
    :class:`Column`
        Column with the values cast to the new type.

    Examples
    --------
    >>> from pyspark.sql.types import StringType
    >>> df = spark.createDataFrame(
    ...     [(2, "Alice"), (5, "Bob")], ["age", "name"])
    >>> df.select(df.age.cast("string").alias('ages')).collect()
    [Row(ages='2'), Row(ages='5')]
    >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
    [Row(ages='2'), Row(ages='5')]
    """
    if isinstance(dataType, str):
        jc = self._jc.cast(dataType)
    elif isinstance(dataType, DataType):
        from pyspark.sql import SparkSession

        spark = SparkSession._getActiveSessionOrCreate()
        jdt = spark._jsparkSession.parseDataType(dataType.json())
        jc = self._jc.cast(jdt)
    else:
        raise TypeError("unexpected type: %s" % type(dataType))
    return Column(jc)
def cast(self, dataType: Union[DataType, str]) -> "Column":
    """
    Casts the column into type ``dataType``.

    .. versionadded:: 1.3.0

    Examples
    --------
    >>> df.select(df.age.cast("string").alias('ages')).collect()
    [Row(ages='2'), Row(ages='5')]
    >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
    [Row(ages='2'), Row(ages='5')]
    """
    if isinstance(dataType, str):
        jc = self._jc.cast(dataType)
    elif isinstance(dataType, DataType):
        from pyspark.sql import SparkSession

        spark = SparkSession._getActiveSessionOrCreate()
        jdt = spark._jsparkSession.parseDataType(dataType.json())
        jc = self._jc.cast(jdt)
    else:
        raise TypeError("unexpected type: %s" % type(dataType))
    return Column(jc)
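# A standalone usage sketch of Column.cast, assuming a local SparkSession
# (the doctests above rely on a pre-populated `spark` / `df` in globs):
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("2", "Alice"), ("5", "Bob")], ["age", "name"])

# A DDL type string and a DataType instance are interchangeable here.
df.select(df.age.cast("int").alias("age_int"),
          df.age.cast(IntegerType()).alias("age_dt")).show()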