Пример #1
 def create_table_using_sql(self, ddl, provider):
     """Drop the shared test table if it already exists.

     A DataFrame is also built from ``SnappyContextTests.testdata`` (split
     into 5 partitions), but nothing in this block consumes it.

     NOTE(review): ``ddl`` and ``provider`` are not referenced by the code
     visible here -- confirm whether the remainder of this helper is missing.
     """
     snappy = SnappyContext(self.sc)
     source_rdd = snappy._sc.parallelize(SnappyContextTests.testdata, 5)
     # Kept for parity with the original; the frame is not used afterwards.
     rows_df = source_rdd.toDF()
     drop_statement = "DROP TABLE IF EXISTS " + SnappyContextTests.tablename
     snappy.sql(drop_statement)
Пример #2
class SnappyStreamingContext(StreamingContext):
    """
    Main entry point for Snappy Spark Streaming functionality. A SnappyStreamingContext
    represents the connection to a Snappy cluster, and can be used to create
    L{DStream} various input sources. It can be from an existing L{SparkContext}.
    After creating and transforming DStreams, the streaming computation can
    be started and stopped using `context.start()` and `context.stop()`,
    respectively. `context.awaitTermination()` allows the current thread
    to wait for the termination of the context by `stop()` or by an exception.
    """

    # NOTE(review): `_transformerSerializer` and `_activeContext` are read below but
    # not declared here; presumably inherited from StreamingContext -- confirm.

    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Create a new SnappyStreamingContext.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which streaming
                              data will be divided into batches
        @param jssc: an existing Java streaming context to wrap; when it is not
                     given a new one is created from `sparkContext` and
                     `batchDuration`
        """
        self._sc = sparkContext
        self._jvm = self._sc._jvm
        self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
        self._snappycontext = SnappyContext(sparkContext)

    @classmethod
    def _ensure_initialized(cls):
        """
        Prepare the Py4J gateway for streaming: import the streaming Java
        packages, start the Python callback server when it is not running yet,
        and initialize the shared TransformFunction serializer.
        """
        # The gateway may not exist yet when recovering from a checkpoint,
        # because no SparkContext has been created at that point.
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            # port 0 lets the OS pick a free port; the real one is read back below
            gw.callback_server_parameters.port = 0
            # The server must be started before `gw._callback_server` is dereferenced:
            # this branch is only entered when it is missing or None.
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            cls._transformerSerializer = transformer_serializer
        else:
            # Reuse the serializer that already exists; only refresh its state.
            cls._transformerSerializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)

    def _initialize_context(self, sc, duration):
        """
        Create the Java-side streaming context for `sc` with the given batch
        duration, making sure the gateway is prepared first.
        """
        self._ensure_initialized()
        return self._jvm.JavaSnappyStreamingContext(sc._jsc, self._jduration(duration))

    @classmethod
    def getOrCreate(cls, checkpointPath, setupFunc):
        """
        Either recreate a SnappyStreamingContext from checkpoint data or create a new SnappyStreamingContext.
        If checkpoint data exists in the provided `checkpointPath`, then SnappyStreamingContext will be
        recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
        will be used to create a new context.

        @param checkpointPath: Checkpoint directory used in an earlier streaming program
        @param setupFunc:      Function to create a new context and setup DStreams
        """
        # Guarantees the gateway exists and `_transformerSerializer` is set
        # before it is updated further down.
        cls._ensure_initialized()
        gw = SparkContext._gateway

        # Check whether valid checkpoint information exists in the given path
        ssc_option = gw.jvm.SnappyStreamingContextPythonHelper().tryRecoverFromCheckpoint(checkpointPath)
        if ssc_option.isEmpty():
            ssc = setupFunc()
            return ssc

        jssc = gw.jvm.JavaSnappyStreamingContext(ssc_option.get())

        # If there is already an active instance of Python SparkContext use it, or create a new one
        if not SparkContext._active_spark_context:
            jsc = jssc.sparkContext()
            conf = SparkConf(_jconf=jsc.getConf())
            SparkContext(conf=conf, gateway=gw, jsc=jsc)

        sc = SparkContext._active_spark_context

        # update ctx in serializer
        cls._transformerSerializer.ctx = sc
        return SnappyStreamingContext(sc, None, jssc)

    @classmethod
    def getActive(cls):
        """
        Return either the currently active SnappyStreamingContext (i.e., if there is a context started
        but not stopped) or None.
        """
        activePythonContext = cls._activeContext
        if activePythonContext is not None:
            # Verify that the current running Java StreamingContext is active and is the same one
            # backing the supposedly active Python context
            activePythonContextJavaId = activePythonContext._jssc.ssc().hashCode()
            activeJvmContextOption = activePythonContext._jvm.SnappyStreamingContext.getActive()

            if activeJvmContextOption.isEmpty():
                cls._activeContext = None
            elif activeJvmContextOption.get().hashCode() != activePythonContextJavaId:
                cls._activeContext = None
                raise Exception("JVM's active JavaStreamingContext is not the JavaStreamingContext "
                                "backing the action Python StreamingContext. This is unexpected.")
        return cls._activeContext

    def start(self):
        """
        Start the execution of the streams.
        """
        # Start the JVM context first; only then record this object as active.
        self._jssc.start()
        SnappyStreamingContext._activeContext = self

    def sql(self, sqlText):
        """Returns a :class:`DataFrame` representing the result of the given query.

        :return: :class:`DataFrame`
        """
        return self._snappycontext.sql(sqlText)

    def union(self, *dstreams):
        """
        Create a unified DStream from multiple DStreams of the same
        type and same slide duration.

        A single input DStream is returned unchanged. ValueError is raised
        when no DStream is given, or when the inputs differ in serializer or
        slide duration.
        """
        if not dstreams:
            raise ValueError("should have at least one DStream to union")
        if len(dstreams) == 1:
            return dstreams[0]
        if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same serializer")
        if len(set(s._slideDuration for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same slide duration")
        first = dstreams[0]
        jrest = [d._jdstream for d in dstreams[1:]]
        return DStream(self._jssc.union(first._jdstream, jrest), self, first._jrdd_deserializer)

    def createSchemaDStream(self, dstream, schema):
        """
        Creates a [[SchemaDStream]] from an DStream of Product.

        TypeError is raised when `schema` is not a StructType or `dstream`
        is not a DStream.
        """
        if not isinstance(schema, StructType):
            raise TypeError("schema should be StructType, but got %s" % type(schema))
        if not isinstance(dstream, DStream):
            raise TypeError("dstream should be DStream, but got %s" % type(dstream))
        return SchemaDStream(dstream._jdstream, self, dstream._jrdd_deserializer, schema)
Пример #3
 def verify_table_rows(self, rowcount):
     """Assert that the shared test table holds exactly ``rowcount`` rows.

     Runs ``SELECT COUNT(*)`` against ``SnappyContextTests.tablename`` and
     compares the first column of the first collected row with ``rowcount``.
     """
     snappy = SnappyContext(self.sc)
     count_query = "SELECT COUNT(*) FROM " + SnappyContextTests.tablename
     collected = snappy.sql(count_query).collect()
     first_row = collected[0]
     self.assertTrue(first_row[0] == rowcount)
Пример #4
class SnappyStreamingContext(StreamingContext):
    """
    Main entry point for Snappy Spark Streaming functionality. A SnappyStreamingContext
    represents the connection to a Snappy cluster, and can be used to create
    L{DStream} various input sources. It can be from an existing L{SparkContext}.
    After creating and transforming DStreams, the streaming computation can
    be started and stopped using `context.start()` and `context.stop()`,
    respectively. `context.awaitTermination()` allows the current thread
    to wait for the termination of the context by `stop()` or by an exception.
    """

    # NOTE(review): `_transformerSerializer` and `_activeContext` are read below but
    # not declared here; presumably inherited from StreamingContext -- confirm.

    def __init__(self, sparkContext, batchDuration=None, jssc=None):
        """
        Create a new SnappyStreamingContext.

        @param sparkContext: L{SparkContext} object.
        @param batchDuration: the time interval (in seconds) at which streaming
                              data will be divided into batches
        @param jssc: an existing Java streaming context to wrap; when it is not
                     given a new one is created from `sparkContext` and
                     `batchDuration`
        """
        self._sc = sparkContext
        self._jvm = self._sc._jvm
        self._jssc = jssc or self._initialize_context(self._sc, batchDuration)
        self._snappycontext = SnappyContext(sparkContext)

    @classmethod
    def _ensure_initialized(cls):
        """
        Prepare the Py4J gateway for streaming: import the streaming Java
        packages, start the Python callback server when it is not running yet,
        and initialize the shared TransformFunction serializer.
        """
        # The gateway may not exist yet when recovering from a checkpoint,
        # because no SparkContext has been created at that point.
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            # port 0 lets the OS pick a free port; the real one is read back below
            gw.callback_server_parameters.port = 0
            # The server must be started before `gw._callback_server` is dereferenced:
            # this branch is only entered when it is missing or None.
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(
                jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(SparkContext._active_spark_context,
                                        CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            cls._transformerSerializer = transformer_serializer
        else:
            # Reuse the serializer that already exists; only refresh its state.
            cls._transformerSerializer.init(SparkContext._active_spark_context,
                                            CloudPickleSerializer(), gw)

    def _initialize_context(self, sc, duration):
        """
        Create the Java-side streaming context for `sc` with the given batch
        duration, making sure the gateway is prepared first.
        """
        self._ensure_initialized()
        return self._jvm.JavaSnappyStreamingContext(sc._jsc,
                                                    self._jduration(duration))

    @classmethod
    def getOrCreate(cls, checkpointPath, setupFunc):
        """
        Either recreate a SnappyStreamingContext from checkpoint data or create a new SnappyStreamingContext.
        If checkpoint data exists in the provided `checkpointPath`, then SnappyStreamingContext will be
        recreated from the checkpoint data. If the data does not exist, then the provided setupFunc
        will be used to create a new context.

        @param checkpointPath: Checkpoint directory used in an earlier streaming program
        @param setupFunc:      Function to create a new context and setup DStreams
        """
        # Guarantees the gateway exists and `_transformerSerializer` is set
        # before it is updated further down.
        cls._ensure_initialized()
        gw = SparkContext._gateway

        # Check whether valid checkpoint information exists in the given path
        ssc_option = gw.jvm.SnappyStreamingContextPythonHelper(
        ).tryRecoverFromCheckpoint(checkpointPath)
        if ssc_option.isEmpty():
            ssc = setupFunc()
            return ssc

        jssc = gw.jvm.JavaSnappyStreamingContext(ssc_option.get())

        # If there is already an active instance of Python SparkContext use it, or create a new one
        if not SparkContext._active_spark_context:
            jsc = jssc.sparkContext()
            conf = SparkConf(_jconf=jsc.getConf())
            SparkContext(conf=conf, gateway=gw, jsc=jsc)

        sc = SparkContext._active_spark_context

        # update ctx in serializer
        cls._transformerSerializer.ctx = sc
        return SnappyStreamingContext(sc, None, jssc)

    @classmethod
    def getActive(cls):
        """
        Return either the currently active SnappyStreamingContext (i.e., if there is a context started
        but not stopped) or None.
        """
        activePythonContext = cls._activeContext
        if activePythonContext is not None:
            # Verify that the current running Java StreamingContext is active and is the same one
            # backing the supposedly active Python context
            activePythonContextJavaId = activePythonContext._jssc.ssc(
            ).hashCode()
            activeJvmContextOption = activePythonContext._jvm.SnappyStreamingContext.getActive(
            )

            if activeJvmContextOption.isEmpty():
                cls._activeContext = None
            elif activeJvmContextOption.get().hashCode(
            ) != activePythonContextJavaId:
                cls._activeContext = None
                raise Exception(
                    "JVM's active JavaStreamingContext is not the JavaStreamingContext "
                    "backing the action Python StreamingContext. This is unexpected."
                )
        return cls._activeContext

    def start(self):
        """
        Start the execution of the streams.
        """
        # Start the JVM context first; only then record this object as active.
        self._jssc.start()
        SnappyStreamingContext._activeContext = self

    def sql(self, sqlText):
        """Returns a :class:`DataFrame` representing the result of the given query.

        :return: :class:`DataFrame`
        """
        return self._snappycontext.sql(sqlText)

    def union(self, *dstreams):
        """
        Create a unified DStream from multiple DStreams of the same
        type and same slide duration.

        A single input DStream is returned unchanged. ValueError is raised
        when no DStream is given, or when the inputs differ in serializer or
        slide duration.
        """
        if not dstreams:
            raise ValueError("should have at least one DStream to union")
        if len(dstreams) == 1:
            return dstreams[0]
        if len(set(s._jrdd_deserializer for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same serializer")
        if len(set(s._slideDuration for s in dstreams)) > 1:
            raise ValueError("All DStreams should have same slide duration")
        first = dstreams[0]
        jrest = [d._jdstream for d in dstreams[1:]]
        return DStream(self._jssc.union(first._jdstream, jrest), self,
                       first._jrdd_deserializer)

    def createSchemaDStream(self, dstream, schema):
        """
        Creates a [[SchemaDStream]] from an DStream of Product.

        TypeError is raised when `schema` is not a StructType or `dstream`
        is not a DStream.
        """
        if not isinstance(schema, StructType):
            raise TypeError("schema should be StructType, but got %s" %
                            type(schema))
        if not isinstance(dstream, DStream):
            raise TypeError("dstream should be DStream, but got %s" %
                            type(dstream))
        return SchemaDStream(dstream._jdstream, self,
                             dstream._jrdd_deserializer, schema)