def _start_sentry_listener(sc):
    # type: (Any) -> None
    """
    Start java gateway server to add custom `SparkListener`
    """
    from pyspark.java_gateway import ensure_callback_server_started

    gw = sc._gateway
    ensure_callback_server_started(gw)
    listener = SentryListener()
    sc._jsc.sc().addSparkListener(listener)
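The `listener` handed to `addSparkListener` above is a plain Python object that the JVM calls back into through the Py4J callback server started by `ensure_callback_server_started`. A trimmed sketch of that pattern (this is not the actual `SentryListener`; the class name and method bodies are illustrative, and a real listener must define every method of `SparkListenerInterface`):

class JobLoggingListener:
    # Called by the JVM through the Py4J callback server.
    def onJobStart(self, jobStart):
        print("job started:", jobStart.jobId())

    def onJobEnd(self, jobEnd):
        print("job ended:", jobEnd.jobId())

    # Tells Py4J which JVM interface this Python object implements, so it
    # can be passed to sc._jsc.sc().addSparkListener(...) as shown above.
    class Java:
        implements = ["org.apache.spark.scheduler.SparkListenerInterface"]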
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    from pyspark.java_gateway import ensure_callback_server_started

    ensure_callback_server_started(gw)

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gw
    )
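`_ensure_initialized` runs as part of constructing a `StreamingContext`, so in normal use the callback server is started for you. A minimal usage sketch (a local master and an arbitrary app name are assumed):

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Constructing the StreamingContext triggers _ensure_initialized(), which
# imports the streaming JVM packages and starts the Py4J callback server
# needed to invoke Python TransformFunctions from the JVM.
sc = SparkContext("local[2]", "callback-server-demo")
ssc = StreamingContext(sc, batchDuration=1)  # one-second micro-batches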
def addListener(self, listener: StreamingQueryListener) -> None:
    """
    Register a :class:`StreamingQueryListener` to receive up-calls for life cycle events of
    :class:`~pyspark.sql.streaming.StreamingQuery`.

    .. versionadded:: 3.4.0
    """
    from pyspark import SparkContext
    from pyspark.java_gateway import ensure_callback_server_started

    gw = SparkContext._gateway
    assert gw is not None
    java_import(gw.jvm, "org.apache.spark.sql.streaming.*")
    ensure_callback_server_started(gw)
    self._jsqm.addListener(listener._jlistener)
def addListener(self, listener: StreamingQueryListener) -> None:
    """
    Register a :class:`StreamingQueryListener` to receive up-calls for life cycle events of
    :class:`~pyspark.sql.streaming.StreamingQuery`.

    .. versionadded:: 3.4.0

    Parameters
    ----------
    listener : :class:`StreamingQueryListener`
        A :class:`StreamingQueryListener` to receive up-calls for life cycle events of
        :class:`~pyspark.sql.streaming.StreamingQuery`.

    Examples
    --------
    >>> from pyspark.sql.streaming import StreamingQueryListener
    >>> class TestListener(StreamingQueryListener):
    ...     def onQueryStarted(self, event):
    ...         pass
    ...
    ...     def onQueryProgress(self, event):
    ...         pass
    ...
    ...     def onQueryTerminated(self, event):
    ...         pass
    >>> test_listener = TestListener()

    Register streaming query listener

    >>> spark.streams.addListener(test_listener)

    Deregister streaming query listener

    >>> spark.streams.removeListener(test_listener)
    """
    from pyspark import SparkContext
    from pyspark.java_gateway import ensure_callback_server_started

    gw = SparkContext._gateway
    assert gw is not None
    java_import(gw.jvm, "org.apache.spark.sql.streaming.*")
    ensure_callback_server_started(gw)
    self._jsqm.addListener(listener._jlistener)
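A slightly fuller sketch of a listener receiving events from a running query (it assumes an active `SparkSession` named `spark`; the `rate` source, `noop` sink, and bookkeeping lists are illustrative only):

import time
from pyspark.sql.streaming import StreamingQueryListener

class CollectingListener(StreamingQueryListener):
    """Records the lifecycle events delivered through the Py4J callback server."""

    def __init__(self):
        self.started, self.progress, self.terminated = [], [], []

    def onQueryStarted(self, event):
        self.started.append(event.id)

    def onQueryProgress(self, event):
        self.progress.append(event.progress.batchId)

    def onQueryTerminated(self, event):
        self.terminated.append(event.id)

listener = CollectingListener()
spark.streams.addListener(listener)

query = spark.readStream.format("rate").load().writeStream.format("noop").start()
time.sleep(5)  # let a few micro-batches complete so progress events arrive
query.stop()

spark.streams.removeListener(listener)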
def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamWriter":
    """
    Sets the output of the streaming query to be processed using the provided
    function. This is supported only in the micro-batch execution modes (that is, when the
    trigger is not continuous). In every micro-batch, the provided function will be called
    with (i) the output rows as a DataFrame and (ii) the batch identifier. The batchId can
    be used to deduplicate and transactionally write the output (that is, the provided
    Dataset) to external systems. The output DataFrame is guaranteed to be exactly the same
    for the same batchId (assuming all operations are deterministic in the query).

    .. versionadded:: 2.4.0

    Notes
    -----
    This API is evolving.

    Examples
    --------
    >>> def func(batch_df, batch_id):
    ...     batch_df.collect()
    ...
    >>> writer = sdf.writeStream.foreachBatch(func)
    """
    from pyspark.java_gateway import ensure_callback_server_started

    gw = self._spark._sc._gateway
    assert gw is not None
    java_import(gw.jvm, "org.apache.spark.sql.execution.streaming.sources.*")
    wrapped_func = ForeachBatchFunction(self._spark, func)
    gw.jvm.PythonForeachBatchHelper.callForeachBatch(self._jwrite, wrapped_func)
    ensure_callback_server_started(gw)
    return self
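A short end-to-end sketch of the pattern (assumes an active `SparkSession` named `spark`; the in-memory list used as a sink is purely illustrative):

import time

seen_batches = []

def write_batch(batch_df, batch_id):
    # batch_id can be used for idempotent or transactional writes;
    # this sketch just records the batch size. The function runs on the
    # driver, invoked from the JVM via the Py4J callback server that
    # foreachBatch() ensures is running.
    seen_batches.append((batch_id, batch_df.count()))

query = (
    spark.readStream.format("rate").load()
    .writeStream
    .foreachBatch(write_batch)
    .start()
)
time.sleep(5)  # let a few micro-batches run
query.stop()
print(seen_batches)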