Example #1
def _start_sentry_listener(sc):
    # type: (Any) -> None
    """
    Start java gateway server to add custom `SparkListener`
    """
    from pyspark.java_gateway import ensure_callback_server_started

    gw = sc._gateway
    ensure_callback_server_started(gw)
    listener = SentryListener()
    sc._jsc.sc().addSparkListener(listener)
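A hedged usage sketch (not part of the original snippet, which comes from sentry-sdk's Spark integration): the point of the helper is that the Py4J callback server must be running before `addSparkListener` hands a Python-implemented listener to the JVM. Assuming `_start_sentry_listener` and `SentryListener` are importable from that module, a manual invocation looks like this:

from pyspark import SparkContext

sc = SparkContext(master="local[2]", appName="sentry-listener-demo")
# _start_sentry_listener starts the callback server and registers the listener;
# job and stage events from the JVM now reach the Python SentryListener.
_start_sentry_listener(sc)
sc.parallelize(range(10)).count()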
Example #2
    @classmethod
    def _ensure_initialized(cls):
        # Import the streaming Java classes and start the Py4J callback server
        # before any Python transform functions are handed to the JVM.
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        from pyspark.java_gateway import ensure_callback_server_started
        ensure_callback_server_started(gw)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
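For context, a sketch of how this class method is normally reached (illustrative only and based on the public API; the master, app name, and batch interval are placeholders): constructing a `StreamingContext` triggers `_ensure_initialized()`, so the Java streaming classes are imported and the callback server is running before any DStream transformations are registered.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(master="local[2]", appName="callback-server-demo")
# Building the StreamingContext runs _ensure_initialized() internally, which
# starts the Py4J callback server used to call back into Python transform functions.
ssc = StreamingContext(sc, batchDuration=1)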
Example #3
File: query.py  Project: zero323/spark
    def addListener(self, listener: StreamingQueryListener) -> None:
        """
        Register a :class:`StreamingQueryListener` to receive up-calls for life cycle events of
        :class:`~pyspark.sql.streaming.StreamingQuery`.

        .. versionadded:: 3.4.0
        """
        from pyspark import SparkContext
        from pyspark.java_gateway import ensure_callback_server_started

        gw = SparkContext._gateway
        assert gw is not None
        java_import(gw.jvm, "org.apache.spark.sql.streaming.*")
        ensure_callback_server_started(gw)

        self._jsqm.addListener(listener._jlistener)
Example #4
    def addListener(self, listener: StreamingQueryListener) -> None:
        """
        Register a :class:`StreamingQueryListener` to receive up-calls for life cycle events of
        :class:`~pyspark.sql.streaming.StreamingQuery`.

        .. versionadded:: 3.4.0

        Parameters
        ----------
        listener : :class:`StreamingQueryListener`
            A :class:`StreamingQueryListener` to receive up-calls for life cycle events of
            :class:`~pyspark.sql.streaming.StreamingQuery`.

        Examples
        --------
        >>> from pyspark.sql.streaming import StreamingQueryListener
        >>> class TestListener(StreamingQueryListener):
        ...     def onQueryStarted(self, event):
        ...         pass
        ...
        ...     def onQueryProgress(self, event):
        ...         pass
        ...
        ...     def onQueryTerminated(self, event):
        ...         pass
        >>> test_listener = TestListener()

        Register streaming query listener

        >>> spark.streams.addListener(test_listener)

        Deregister streaming query listener

        >>> spark.streams.removeListener(test_listener)
        """
        from pyspark import SparkContext
        from pyspark.java_gateway import ensure_callback_server_started

        gw = SparkContext._gateway
        assert gw is not None
        java_import(gw.jvm, "org.apache.spark.sql.streaming.*")
        ensure_callback_server_started(gw)

        self._jsqm.addListener(listener._jlistener)
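A slightly fuller listener than the doctest's TestListener, as a hedged sketch (the rate source, query name, and printed fields are illustrative and assume an active `spark` session, as in the doctest): each callback below is invoked from the JVM through the Py4J callback server that addListener makes sure is running.

from pyspark.sql.streaming import StreamingQueryListener

class LoggingListener(StreamingQueryListener):
    def onQueryStarted(self, event):
        print(f"started: id={event.id} name={event.name}")

    def onQueryProgress(self, event):
        print(f"progress: batchId={event.progress.batchId}")

    def onQueryTerminated(self, event):
        print(f"terminated: id={event.id}")

listener = LoggingListener()
spark.streams.addListener(listener)

query = (
    spark.readStream.format("rate").load()
    .writeStream.format("noop").queryName("listener-demo").start()
)
query.processAllAvailable()
query.stop()
spark.streams.removeListener(listener)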
Example #5
    def foreachBatch(
        self, func: Callable[["DataFrame", int], None]
    ) -> "DataStreamWriter":
        """
        Sets the output of the streaming query to be processed using the provided
        function. This is supported only in the micro-batch execution mode (that is, when the
        trigger is not continuous). In every micro-batch, the provided function will be called
        with (i) the output rows as a DataFrame and (ii) the batch identifier.
        The batchId can be used to deduplicate and transactionally write the output
        (that is, the provided DataFrame) to external systems. The output DataFrame is
        guaranteed to be exactly the same for the same batchId (assuming all operations are
        deterministic in the query).

        .. versionadded:: 2.4.0

        Notes
        -----
        This API is evolving.

        Examples
        --------
        >>> def func(batch_df, batch_id):
        ...     batch_df.collect()
        ...
        >>> writer = sdf.writeStream.foreachBatch(func)
        """

        from pyspark.java_gateway import ensure_callback_server_started

        gw = self._spark._sc._gateway
        assert gw is not None
        java_import(gw.jvm,
                    "org.apache.spark.sql.execution.streaming.sources.*")

        wrapped_func = ForeachBatchFunction(self._spark, func)
        gw.jvm.PythonForeachBatchHelper.callForeachBatch(
            self._jwrite, wrapped_func)
        ensure_callback_server_started(gw)
        return self
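As a hedged companion to the doctest above (output and checkpoint paths are made-up placeholders), a sketch of the idempotent-write pattern the docstring describes: because a replayed micro-batch reuses the same batch_id with the same data, keying the sink write by batch_id lets a retry overwrite its own output instead of duplicating it.

def write_batch(batch_df, batch_id):
    # batch_id is stable across retries of the same micro-batch, so an
    # overwrite keyed by it keeps the external write idempotent.
    batch_df.write.mode("overwrite").parquet(f"/tmp/out/batch_id={batch_id}")

query = (
    sdf.writeStream
    .foreachBatch(write_batch)
    .option("checkpointLocation", "/tmp/chk")
    .start()
)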