def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start callback server
    # getattr will fallback to JVM, so we cannot test by hasattr()
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of CallbackClient with real port
        jgws.resetCallbackClient(jgws.getCallbackClient().getAddress(), gw._python_proxy_port)

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start callback server
    # getattr will fallback to JVM, so we cannot test by hasattr()
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of CallbackClient with real port
        gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def __init__(self, loadDefaults=True, _jvm=None):
    super(SparkConf, self).__init__()
    from pyspark.context import SparkContext
    SparkContext._ensure_initialized()
    _jvm = _jvm or SparkContext._jvm
    self._jconf = _jvm.SparkConf(loadDefaults)
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    # start callback server
    # getattr will fallback to JVM, so we cannot test by hasattr()
    if "_callback_server" not in gw.__dict__ or gw._callback_server is None:
        gw.callback_server_parameters.eager_load = True
        gw.callback_server_parameters.daemonize = True
        gw.callback_server_parameters.daemonize_connections = True
        gw.callback_server_parameters.port = 0
        gw.start_callback_server(gw.callback_server_parameters)
        cbport = gw._callback_server.server_socket.getsockname()[1]
        gw._callback_server.port = cbport
        # gateway with real port
        gw._python_proxy_port = gw._callback_server.port
        # get the GatewayServer object in JVM by ID
        jgws = JavaObject("GATEWAY_SERVER", gw._gateway_client)
        # update the port of CallbackClient with real port
        gw.jvm.PythonDStream.updatePythonGatewayPort(jgws, gw._python_proxy_port)
        _py4j_cleaner = Py4jCallbackConnectionCleaner(gw)
        _py4j_cleaner.start()

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    if cls._transformerSerializer is None:
        transformer_serializer = TransformFunctionSerializer()
        transformer_serializer.init(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
        # SPARK-12511 streaming driver with checkpointing unable to finalize leading to OOM
        # There is an issue that Py4J's PythonProxyHandler.finalize blocks forever.
        # (https://github.com/bartdag/py4j/pull/184)
        #
        # Py4j will create a PythonProxyHandler in Java for "transformer_serializer" when
        # calling "registerSerializer". If we call "registerSerializer" twice, the second
        # PythonProxyHandler will override the first one, then the first one will be GCed and
        # trigger "PythonProxyHandler.finalize". To avoid that, we should not call
        # "registerSerializer" more than once, so that "PythonProxyHandler" in Java side won't
        # be GCed.
        #
        # TODO Once Py4J fixes this issue, we should upgrade Py4j to the latest version.
        transformer_serializer.gateway.jvm.PythonDStream.registerSerializer(
            transformer_serializer)
        cls._transformerSerializer = transformer_serializer
    else:
        cls._transformerSerializer.init(
            SparkContext._active_spark_context, CloudPickleSerializer(), gw)
def __init__(self, loadDefaults=True, _jvm=None): """ Create a new Spark configuration. @param loadDefaults: whether to load values from Java system properties (True by default) @param _jvm: internal parameter used to pass a handle to the Java VM; does not need to be set by users """ from pyspark.context import SparkContext SparkContext._ensure_initialized() _jvm = _jvm or SparkContext._jvm self._jconf = _jvm.SparkConf(loadDefaults)
def __init__(self, millis, _jvm=None): """ Create new Duration. @param millis: milisecond """ self._millis = millis from pyspark.context import SparkContext SparkContext._ensure_initialized() _jvm = _jvm or SparkContext._jvm self._jduration = _jvm.Duration(millis)
def __init__(self, loadDefaults=True, _jvm=None): """ Create a new Spark configuration. @param loadDefaults: whether to load values from Java system properties (True by default) @param _jvm: internal parameter used to pass a handle to the Java VM; does not need to be set by users """ from pyspark.context import SparkContext SparkContext._ensure_initialized() _jvm = _jvm or SparkContext._jvm self._jconf = _jvm.SparkConf(loadDefaults)
def _ensure_initialized(cls):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway

    java_import(gw.jvm, "org.apache.spark.streaming.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.java.*")
    java_import(gw.jvm, "org.apache.spark.streaming.api.python.*")

    from pyspark.java_gateway import ensure_callback_server_started
    ensure_callback_server_started(gw)

    # register serializer for TransformFunction
    # it happens before creating SparkContext when loading from checkpointing
    cls._transformerSerializer = TransformFunctionSerializer(
        SparkContext._active_spark_context, CloudPickleSerializer(), gw)
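# Sketch of the user-facing call path that ends up in _ensure_initialized():
# creating a StreamingContext (or restoring one from a checkpoint) triggers it
# before the JVM streaming context is built. The batch interval and checkpoint
# directory below are placeholders.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "streaming-example")
ssc = StreamingContext(sc, batchDuration=1)       # runs _ensure_initialized() internally
ssc.checkpoint("/tmp/streaming-checkpoint")       # placeholder checkpoint directory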
def main(args):
    SparkContext._ensure_initialized()
    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
        spark = SparkSession.builder\
            .enableHiveSupport()\
            .getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession.builder.getOrCreate()
    except TypeError:
        spark = SparkSession.builder.getOrCreate()

    try:
        spark.conf.set("spark.sql.sources.partitionOverwriteMode",
                       args.partitionOverwriteMode)
        check_in_database(spark, args.src_db, args.dst_db)
    finally:
        if spark:
            spark.sparkContext.stop()
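# Hedged sketch of how the `args` namespace consumed by main() above could be
# built. The flag names are assumptions inferred from the attributes used
# (partitionOverwriteMode, src_db, dst_db), not taken from the original script.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--src-db", dest="src_db", required=True)
parser.add_argument("--dst-db", dest="dst_db", required=True)
parser.add_argument("--partition-overwrite-mode", dest="partitionOverwriteMode",
                    default="dynamic", choices=["static", "dynamic"])

if __name__ == "__main__":
    main(parser.parse_args())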
import atexit
import os
import platform

import py4j

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.builder\
        .enableHiveSupport()\
        .getOrCreate()
except py4j.protocol.Py4JError:
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext
atexit.register(lambda: sc.stop())
def __init__(self, sparkSession):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway
    java_import(gw.jvm, "org.apache.spark.sql.TiContext")
    self.ti = gw.jvm.TiContext(sparkSession._jsparkSession)
def __init__(self, sparkSession):
    SparkContext._ensure_initialized()
    gw = SparkContext._gateway
    java_import(gw.jvm, "org.apache.spark.sql.TiExtensions")
    self.ti = gw.jvm.TiExtensions.getInstance(
        sparkSession._jsparkSession).getOrCreateTiContext(sparkSession._jsparkSession)
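# Hedged usage sketch for the two TiSpark wrapper constructors above. The
# Python class name (PyTiContext) is an assumption for illustration; the
# snippets only show its __init__. All that is required is an existing
# SparkSession; `wrapper.ti` is then a py4j handle to the Java TiContext.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("tispark-example").getOrCreate()
wrapper = PyTiContext(spark)     # hypothetical class owning the __init__ shown above
jvm_ti_context = wrapper.ti      # py4j proxy for org.apache.spark.sql.TiContext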
import atexit
import os
import platform

import py4j

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.builder\
        .enableHiveSupport()\
        .getOrCreate()
except py4j.protocol.Py4JError:
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())
This file is designed to be launched as a PYTHONSTARTUP script.
"""
import atexit
import os
import platform
import warnings

from pyspark.context import SparkContext
from pyspark.sql import SparkSession

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()  # type: ignore

try:
    spark = SparkSession._create_shell_session()  # type: ignore
except Exception:
    import sys
    import traceback

    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())
# for compatibility
def start(self):
    epochTime = int(time.mktime(time.localtime()))
    randomNum = random.randint(0, 10000)
    self.__appName = "pyspark-" + str(epochTime) + "-" + str(randomNum)

    value = self.__args.get('spark.home')
    if not value:
        value = "/data/software/spark-2.3.1-bin-hadoop2.7"
    findspark.init(spark_home=value, python_path="python3")

    value = self.__args.get('spark.driver.memory')
    if not value:
        value = "3g"
    findspark.set_driver_mem(value)

    value = self.__args.get('spark.executor.memory')
    if not value:
        value = "3g"
    findspark.set_executor_mem(value)

    findspark.set_app_name(self.__appName)
    findspark.end()

    import pyspark
    from pyspark import SparkConf
    from pyspark.sql import SparkSession, SQLContext
    from pyspark.context import SparkContext

    if os.environ.get("SPARK_EXECUTOR_URI"):
        SparkContext.setSystemProperty("spark.executor.uri", os.environ["SPARK_EXECUTOR_URI"])
    SparkContext._ensure_initialized()

    pySpark = None
    sc_conf = SparkConf()
    sc_conf.set('spark.locality.wait', 30000)
    sc_conf.set('spark.sql.autoBroadcastJoinThreshold', -1)
    sc_conf.set('spark.scheduler.minRegisteredResourcesRatio', 1)

    value = self.__args.get('spark.executor.cores')
    if not value:
        value = '1'
    sc_conf.set('spark.executor.cores', int(value))

    value = self.__args.get('spark.executor.instances')
    if not value:
        value = '1'
    sc_conf.set('spark.executor.instances', int(value))

    try:
        # Try to access HiveConf, it will raise exception if Hive is not added
        SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
        spark = SparkSession.builder.enableHiveSupport().config(
            conf=sc_conf).getOrCreate()
    except py4j.protocol.Py4JError:
        spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()
    except TypeError:
        spark = SparkSession.builder.config(conf=sc_conf).getOrCreate()

    sc = spark.sparkContext
    return spark, sc
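# Hedged usage sketch for the start() method above. The owning class name
# (SparkLauncher) and its constructor signature are assumptions for
# illustration; only the body of start() comes from the snippet. It expects a
# dict of overrides in self.__args and returns (SparkSession, SparkContext).
launcher = SparkLauncher({"spark.executor.memory": "4g",   # hypothetical class
                          "spark.executor.cores": "2"})
spark, sc = launcher.start()
print(sc.applicationId)
spark.stop()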