Example #1
    def __init__(
        self,
        sparkContext: SparkContext,
        sparkSession: Optional[SparkSession] = None,
        jsqlContext: Optional[JavaObject] = None,
    ):
        if sparkSession is None:
            warnings.warn(
                "Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.",
                FutureWarning,
            )

        self._sc = sparkContext
        self._jsc = self._sc._jsc
        self._jvm = self._sc._jvm
        if sparkSession is None:
            sparkSession = SparkSession._getActiveSessionOrCreate()
        if jsqlContext is None:
            jsqlContext = sparkSession._jsparkSession.sqlContext()
        self.sparkSession = sparkSession
        self._jsqlContext = jsqlContext
        _monkey_patch_RDD(self.sparkSession)
        install_exception_handler()
        if (SQLContext._instantiatedContext is None
                or SQLContext._instantiatedContext._sc._jsc is None):
            SQLContext._instantiatedContext = self
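
Constructing SQLContext directly is deprecated here; a minimal sketch of the replacement named in the warning (assuming a local PySpark install):

    from pyspark.sql import SparkSession

    # Recommended replacement for SQLContext(sparkContext, ...)
    spark = SparkSession.builder.getOrCreate()
    # The former SQLContext entry points live on the session, e.g.:
    # spark.sql("SELECT 1"), spark.read, spark.udf.register(...)
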
Example #2
    def schema(self, schema: Union[StructType, str]) -> "DataStreamReader":
        """Specifies the input schema.

        Some data sources (e.g. JSON) can infer the input schema automatically from data.
        By specifying the schema here, the underlying data source can skip the schema
        inference step, and thus speed up data loading.

        .. versionadded:: 2.0.0

        Parameters
        ----------
        schema : :class:`pyspark.sql.types.StructType` or str
            a :class:`pyspark.sql.types.StructType` object or a DDL-formatted string
            (For example ``col0 INT, col1 DOUBLE``).

        Notes
        -----
        This API is evolving.

        Examples
        --------
        >>> s = spark.readStream.schema(sdf_schema)
        >>> s = spark.readStream.schema("col0 INT, col1 DOUBLE")
        """
        from pyspark.sql import SparkSession

        spark = SparkSession._getActiveSessionOrCreate()
        if isinstance(schema, StructType):
            jschema = spark._jsparkSession.parseDataType(schema.json())
            self._jreader = self._jreader.schema(jschema)
        elif isinstance(schema, str):
            self._jreader = self._jreader.schema(schema)
        else:
            raise TypeError("schema should be StructType or string")
        return self
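
For comparison with the DDL-string form in the docstring, a short sketch that passes an explicit StructType instead (the session and schema names are assumptions):

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

    spark = SparkSession.builder.getOrCreate()
    # Equivalent to the DDL string "col0 INT, col1 DOUBLE"
    sdf_schema = StructType([
        StructField("col0", IntegerType(), True),
        StructField("col1", DoubleType(), True),
    ])
    reader = spark.readStream.schema(sdf_schema)
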
Example #3
def _test() -> None:
    import sys
    import doctest
    import os
    import tempfile
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, StringType
    import pyspark.sql.streaming.readwriter
    from py4j.protocol import Py4JError

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.streaming.readwriter.__dict__.copy()
    try:
        spark = SparkSession._getActiveSessionOrCreate()
    except Py4JError:  # noqa: F821
        spark = SparkSession(sc)  # type: ignore[name-defined] # noqa: F821

    globs["tempfile"] = tempfile
    globs["spark"] = spark
    globs["sdf"] = spark.readStream.format("text").load(
        "python/test_support/sql/streaming")
    globs["sdf_schema"] = StructType([StructField("data", StringType(), True)])

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.streaming.readwriter,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
        | doctest.REPORT_NDIFF,
    )
    globs["spark"].stop()

    if failure_count:
        sys.exit(-1)
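
In the PySpark sources a doctest runner like this is normally invoked from the module's entry point; a minimal sketch:

    if __name__ == "__main__":
        _test()
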
Example #4
    def _create_judf(self, func: Callable[..., Any]) -> JavaObject:
        from pyspark.sql import SparkSession

        spark = SparkSession._getActiveSessionOrCreate()
        sc = spark.sparkContext

        wrapped_func = _wrap_function(sc, func, self.returnType)
        jdt = spark._jsparkSession.parseDataType(self.returnType.json())
        judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction(  # type: ignore[attr-defined]
            self._name, wrapped_func, jdt, self.evalType, self.deterministic)
        return judf
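
_create_judf is an internal step behind the public udf() factory; a hedged sketch of the user-facing path that ends up calling it (the lambda and the column used in the commented line are illustrative):

    from pyspark.sql.functions import udf
    from pyspark.sql.types import IntegerType

    # Wrapping a plain Python function; the Java-side UserDefinedPythonFunction
    # shown above is built when the UDF is actually used in an expression.
    plus_one = udf(lambda x: x + 1, IntegerType())
    # df.select(plus_one(df["value"]))  # usage, assuming an existing DataFrame `df`
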
Example #5
    def _get_or_create(cls: Type["SQLContext"], sc: SparkContext,
                       **static_conf: Any) -> "SQLContext":

        if (
            cls._instantiatedContext is None
            or SQLContext._instantiatedContext._sc._jsc is None  # type: ignore[union-attr]
        ):
            assert sc._jvm is not None
            # There can be only one running Spark context. That will automatically
            # be used in the Spark session internally.
            session = SparkSession._getActiveSessionOrCreate(**static_conf)
            cls(sc, session, session._jsparkSession.sqlContext())
        return cast(SQLContext, cls._instantiatedContext)
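
This helper backs the deprecated SQLContext.getOrCreate classmethod; a minimal usage sketch, assuming a running SparkContext:

    from pyspark import SparkContext
    from pyspark.sql import SQLContext

    sc = SparkContext.getOrCreate()
    # Deprecated since 3.0.0; kept for code that still relies on SQLContext
    sqlContext = SQLContext.getOrCreate(sc)
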
Example #6
    def __init__(self,
                 sparkContext: SparkContext,
                 jhiveContext: Optional[JavaObject] = None):
        warnings.warn(
            "HiveContext is deprecated in Spark 2.0.0. Please use " +
            "SparkSession.builder.enableHiveSupport().getOrCreate() instead.",
            FutureWarning,
        )
        static_conf = {}
        if jhiveContext is None:
            static_conf = {"spark.sql.catalogImplementation": "in-memory"}
        # There can be only one running Spark context. That will automatically
        # be used in the Spark session internally.
        session = SparkSession._getActiveSessionOrCreate(**static_conf)
        SQLContext.__init__(self, sparkContext, session, jhiveContext)
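
As the warning says, HiveContext is superseded by a Hive-enabled SparkSession; a minimal sketch of the suggested replacement:

    from pyspark.sql import SparkSession

    # Replacement recommended by the deprecation warning above
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()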