def __init__(
    self,
    sparkContext: SparkContext,
    sparkSession: Optional[SparkSession] = None,
    jsqlContext: Optional[JavaObject] = None,
):
    if sparkSession is None:
        warnings.warn(
            "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.",
            FutureWarning,
        )

    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm

    if sparkSession is None:
        sparkSession = SparkSession._getActiveSessionOrCreate()
    if jsqlContext is None:
        jsqlContext = sparkSession._jsparkSession.sqlContext()

    self.sparkSession = sparkSession
    self._jsqlContext = jsqlContext
    _monkey_patch_RDD(self.sparkSession)
    install_exception_handler()
    # Cache the first live instance so it can be reused by getOrCreate-style paths.
    if (
        SQLContext._instantiatedContext is None
        or SQLContext._instantiatedContext._sc._jsc is None
    ):
        SQLContext._instantiatedContext = self
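# Hedged usage sketch: SQLContext is deprecated, so new code should go through
# SparkSession directly. The name `sc` below is illustrative, not from this module.
#
#     sc = SparkContext.getOrCreate()
#     sqlContext = SQLContext(sc)                   # emits FutureWarning
#     spark = SparkSession.builder.getOrCreate()    # preferred replacement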
def schema(self, schema: Union[StructType, str]) -> "DataStreamReader":
    """Specifies the input schema.

    Some data sources (e.g. JSON) can infer the input schema automatically from data.
    By specifying the schema here, the underlying data source can skip the schema
    inference step, and thus speed up data loading.

    .. versionadded:: 2.0.0

    Parameters
    ----------
    schema : :class:`pyspark.sql.types.StructType` or str
        a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string
        (For example ``col0 INT, col1 DOUBLE``).

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> s = spark.readStream.schema(sdf_schema)
    >>> s = spark.readStream.schema("col0 INT, col1 DOUBLE")
    """
    from pyspark.sql import SparkSession

    spark = SparkSession._getActiveSessionOrCreate()
    if isinstance(schema, StructType):
        jschema = spark._jsparkSession.parseDataType(schema.json())
        self._jreader = self._jreader.schema(jschema)
    elif isinstance(schema, str):
        self._jreader = self._jreader.schema(schema)
    else:
        raise TypeError("schema should be StructType or string")
    return self
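# A minimal end-to-end sketch (hedged) showing that the StructType form and the
# DDL-string form of schema() are interchangeable; the text source and path
# mirror the doctest fixtures used elsewhere in this module.
#
#     from pyspark.sql.types import StructType, StructField, StringType
#     sdf_schema = StructType([StructField("data", StringType(), True)])
#     sdf1 = spark.readStream.schema(sdf_schema).format("text").load(
#         "python/test_support/sql/streaming")
#     sdf2 = spark.readStream.schema("data STRING").format("text").load(
#         "python/test_support/sql/streaming")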
def _test() -> None:
    import doctest
    import os
    import sys
    import tempfile

    from py4j.protocol import Py4JError

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StringType, StructField, StructType
    import pyspark.sql.streaming.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.streaming.readwriter.__dict__.copy()
    try:
        spark = SparkSession._getActiveSessionOrCreate()
    except Py4JError:
        # Fall back to building a fresh session when no JVM-backed session exists.
        spark = SparkSession.builder.getOrCreate()

    globs["tempfile"] = tempfile
    globs["spark"] = spark
    globs["sdf"] = spark.readStream.format("text").load("python/test_support/sql/streaming")
    globs["sdf_schema"] = StructType([StructField("data", StringType(), True)])

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.streaming.readwriter,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    globs["spark"].stop()
    if failure_count:
        sys.exit(-1)
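# Mirroring the usual PySpark module layout (an assumption for this excerpt),
# the doctest runner would be wired to direct script execution at the bottom of
# the module:
#
#     if __name__ == "__main__":
#         _test()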
def _create_judf(self, func: Callable[..., Any]) -> JavaObject:
    from pyspark.sql import SparkSession

    spark = SparkSession._getActiveSessionOrCreate()
    sc = spark.sparkContext

    # Wrap the Python callable for shipping to the JVM and parse the declared
    # return type into a JVM DataType, then build the JVM-side UDF wrapper.
    wrapped_func = _wrap_function(sc, func, self.returnType)
    jdt = spark._jsparkSession.parseDataType(self.returnType.json())
    judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(  # type: ignore[attr-defined]
        self._name, wrapped_func, jdt, self.evalType, self.deterministic
    )
    return judf
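# Hedged usage sketch: _create_judf is the internal machinery behind
# pyspark.sql.functions.udf; `slen` and `df` below are illustrative names.
#
#     from pyspark.sql.functions import udf
#     from pyspark.sql.types import IntegerType
#     slen = udf(lambda s: len(s), IntegerType())  # the judf is built lazily, on first use
#     df.select(slen(df.name)).show()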
@classmethod
def _get_or_create(
    cls: Type["SQLContext"], sc: SparkContext, **static_conf: Any
) -> "SQLContext":
    if (
        cls._instantiatedContext is None
        or SQLContext._instantiatedContext._sc._jsc is None  # type: ignore[union-attr]
    ):
        assert sc._jvm is not None
        # There can be only one running Spark context. That will automatically
        # be used in the Spark session internally.
        session = SparkSession._getActiveSessionOrCreate(**static_conf)
        cls(sc, session, session._jsparkSession.sqlContext())
    return cast(SQLContext, cls._instantiatedContext)
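# Hedged sketch of the caching contract: repeated calls while a context is live
# return the same instance (`sc` is an illustrative SparkContext).
#
#     ctx1 = SQLContext._get_or_create(sc)
#     ctx2 = SQLContext._get_or_create(sc)
#     assert ctx1 is ctx2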
def __init__(self, sparkContext: SparkContext, jhiveContext: Optional[JavaObject] = None):
    warnings.warn(
        "HiveContext is deprecated in Spark 2.0.0. Please use "
        + "SparkSession.builder.enableHiveSupport().getOrCreate() instead.",
        FutureWarning,
    )
    static_conf = {}
    if jhiveContext is None:
        # A HiveContext requires the Hive catalog implementation, not the
        # default in-memory one.
        static_conf = {"spark.sql.catalogImplementation": "hive"}
    # There can be only one running Spark context. That will automatically
    # be used in the Spark session internally.
    session = SparkSession._getActiveSessionOrCreate(**static_conf)
    SQLContext.__init__(self, sparkContext, session, jhiveContext)
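# Migration sketch (hedged), following the recommendation in the warning above:
#
#     spark = SparkSession.builder.enableHiveSupport().getOrCreate()
#     spark.sql("SHOW TABLES").show()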