Example #1
File: context.py  Project: 11wzy001/spark
    @classmethod
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
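This classmethod is not normally called by user code; it runs when a StreamingContext is constructed or restored from a checkpoint. A minimal sketch of how it gets triggered, assuming the snippet comes from pyspark/streaming/context.py as the file header suggests:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="EnsureInitializedDemo")
# Building a StreamingContext invokes StreamingContext._ensure_initialized()
# internally, which imports the streaming Java classes and starts the Py4J
# callback server as shown above.
ssc = StreamingContext(sc, 1)  # 1-second batch interval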
Example #2
    @classmethod
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(
                jgws, gw._python_proxy_port)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
Example #3
    def __init__(self, loadDefaults=True, _jvm=None):
        super(SparkConf, self).__init__()
        from pyspark.context import SparkContext
        SparkContext._ensure_initialized()
        _jvm = _jvm or SparkContext._jvm
        self._jconf = _jvm.SparkConf(loadDefaults)
Example #4
    @classmethod
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(
                jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(SparkContext._active_spark_context,
                                        CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            #
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            #
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
                transformer_serializer)
            cls._transformerSerializer = transformer_serializer
        else:
            cls._transformerSerializer.init(SparkContext._active_spark_context,
                                            CloudPickleSerializer(), gw)
Example #5
    def __init__(self, loadDefaults=True, _jvm=None):
        """
        Create a new Spark configuration.

        @param loadDefaults: whether to load values from Java system
               properties (True by default)
        @param _jvm: internal parameter used to pass a handle to the
               Java VM; does not need to be set by users
        """
        from pyspark.context import SparkContext
        SparkContext._ensure_initialized()
        _jvm = _jvm or SparkContext._jvm
        self._jconf = _jvm.SparkConf(loadDefaults)
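A short usage sketch of the constructor above; the configuration is applied when the SparkContext is created:

from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("conf-demo").setMaster("local[2]")
sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.app.name"))  # prints "conf-demo"
sc.stop()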
Example #6
    def __init__(self, millis, _jvm=None):
        """
        Create a new Duration.

        @param millis: milliseconds

        """
        self._millis = millis

        from pyspark.context import SparkContext
        SparkContext._ensure_initialized()
        _jvm = _jvm or SparkContext._jvm
        self._jduration = _jvm.Duration(millis)
Example #7
    def __init__(self, loadDefaults=True, _jvm=None):
        """
        Create a new Spark configuration.

        @param loadDefaults: whether to load values from Java system
               properties (True by default)
        @param _jvm: internal parameter used to pass a handle to the
               Java VM; does not need to be set by users
        """
        from pyspark.context import SparkContext
        SparkContext._ensure_initialized()
        _jvm = _jvm or SparkContext._jvm
        self._jconf = _jvm.SparkConf(loadDefaults)
Example #8
    @classmethod
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        # start callback server
        # getattr will fallback to JVM, so we cannot test by hasattr()
        if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
            gw.callback_server_parameters.eager_load = True
            gw.callback_server_parameters.daemonize = True
            gw.callback_server_parameters.daemonize_connections = True
            gw.callback_server_parameters.port = 0
            gw.start_callback_server(gw.callback_server_parameters)
            cbport = gw._callback_server.server_socket.getsockname()[1]
            gw._callback_server.port = cbport
            # gateway with real port
            gw._python_proxy_port = gw._callback_server.port
            # get the GatewayServer object in JVM by ID
            jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
            # update the port of CallbackClient with real port
            gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
            _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
            _py4j_cleaner.start()

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        if cls._transformerSerializer is None:
            transformer_serializer = TransformFunctionSerializer()
            transformer_serializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)
            # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
            # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
            # (https://github.com/bartdag/py4j/pull/184)
            #
            # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
            # calling "registerSerializer". If we call "registerSerializer" twice, the second
            # PythonProxyHandler will override the first one, then the first one will be GCed and
            # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
            # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
            # be GCed.
            #
            # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
            transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
                transformer_serializer)
            cls._transformerSerializer = transformer_serializer
        else:
            cls._transformerSerializer.init(
                SparkContext._active_spark_context, CloudPickleSerializer(), gw)
Example #9
    @classmethod
    def _ensure_initialized(cls):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway

        java_import(gw.jvm, "org.apache.spark.streaming.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
        java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

        from pyspark.java_gateway import ensure_callback_server_started
        ensure_callback_server_started(gw)

        # register serializer for TransformFunction
        # it happens before creating SparkContext when loading from checkpointing
        cls._transformerSerializer = TransformFunctionSerializer(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
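Example #9 replaces the inline callback-server setup of the earlier examples with the ensure_callback_server_started helper from pyspark.java_gateway. Its behaviour corresponds to the logic shown in Examples #1 and #2; a rough sketch under that assumption, not the verbatim pyspark implementation:

from py4j.java_gateway import JavaObject

def ensure_callback_server_started(gw):
    # Start the Py4J callback server on an ephemeral port if it is not running yet.
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        # Tell the JVM-side CallbackClient which real port was bound.
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        gw._python_proxy_port = cbport
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), cbport)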
Example #10
def main(args):
    SparkContext._ensure_initialized()

    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
        spark = SparkSession.builder\
            .enableHiveSupport()\
            .getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession.builder.getOrCreate()
    except TypeError:
        spark = SparkSession.builder.getOrCreate()

    try:
        spark.conf.set("spark.sql.sources.partitionOverwriteMode",
                       args.partitionOverwriteMode)
        check_in_database(spark, args.src_db, args.dst_db)
    finally:
        if spark:
            spark.sparkContext.stop()
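main(args) expects an object exposing partitionOverwriteMode, src_db, and dst_db. A minimal argparse driver consistent with those attribute names; the command-line flag names and the default value are assumptions, not from the original:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--src-db", dest="src_db", required=True)
    parser.add_argument("--dst-db", dest="dst_db", required=True)
    parser.add_argument("--partition-overwrite-mode",
                        dest="partitionOverwriteMode", default="dynamic")
    main(parser.parse_args())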
Example #11
import atexit
import os
import platform

import py4j

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.builder\
        .enableHiveSupport()\
        .getOrCreate()
except py4j.protocol.Py4JError:
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext
atexit.register(lambda: sc.stop())
Example #12
    def __init__(self, sparkSession):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway
        java_import(gw.jvm, "org.apache.spark.sql.TiContext")
        self.ti = gw.jvm.TiContext(sparkSession._jsparkSession)
Example #13
    def __init__(self, sparkSession):
        SparkContext._ensure_initialized()
        gw = SparkContext._gateway
        java_import(gw.jvm, "org.apache.spark.sql.TiExtensions")
        self.ti = gw.jvm.TiExtensions.getInstance(
            sparkSession._jsparkSession).getOrCreateTiContext(sparkSession._jsparkSession)
Example #14
File: shell.py  Project: 1574359445/spark
import atexit
import os
import platform

import py4j

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.builder\
        .enableHiveSupport()\
        .getOrCreate()
except py4j.protocol.Py4JError:
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())
Example #15
"""
This file is designed to be launched as a PYTHONSTARTUP script.
"""

import atexit
import os
import platform
import warnings

from pyspark.context import SparkContext
from pyspark.sql import SparkSession

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()  # type: ignore

try:
    spark = SparkSession._create_shell_session()  # type: ignore
except Exception:
    import sys
    import traceback
    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())

# for compatibility
Example #16
    def start(self):
        # Modules used below; imported here so the snippet is self-contained.
        import os
        import random
        import time

        import findspark
        import py4j.protocol  # for py4j.protocol.Py4JError

        epochTime = int(time.mktime(time.localtime()))
        randomNum = random.randint(0, 10000)
        self.__appName = "pyspark-" + str(epochTime) + "-" + str(randomNum)

        value = self.__args.get('spark.home')
        if not value:
            value = "/data/software/spark-2.3.1-bin-hadoop2.7"

        findspark.init(spark_home=value, python_path="python3")

        value = self.__args.get('spark.driver.memory')
        if not value:
            value = "3g"
        findspark.set_driver_mem(value)

        value = self.__args.get('spark.executor.memory')
        if not value:
            value = "3g"
        findspark.set_executor_mem(value)

        findspark.set_app_name(self.__appName)
        findspark.end()

        import pyspark
        from pyspark import SparkConf
        from pyspark.sql import SparkSession, SQLContext

        from pyspark.context import SparkContext
        if os.environ.get("SPARK_EXECUTOR_URI"):
            SparkContext.setSystemProperty("spark.executor.uri",
                                           os.environ["SPARK_EXECUTOR_URI"])

        SparkContext._ensure_initialized()
        pySpark = None

        sc_conf = SparkConf()
        sc_conf.set('spark.locality.wait', 30000)
        sc_conf.set('spark.sql.autoBroadcastJoinThreshold', -1)
        sc_conf.set('spark.scheduler.minRegisteredResourcesRatio', 1)

        value = self.__args.get('spark.executor.cores')
        if not value:
            value = '1'
        sc_conf.set('spark.executor.cores', int(value))

        value = self.__args.get('spark.executor.instances')
        if not value:
            value = '1'
        sc_conf.set('spark.executor.instances', int(value))

        try:
            # Try to access HiveConf, it will raise exception if Hive is not added
            SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
            spark = SparkSession.builder.enableHiveSupport().config(
                conf=sc_conf).getOrCreate()

        except py4j.protocol.Py4JError:
            spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()
        except TypeError:
            spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()

        sc = spark.sparkContext

        return spark, sc