Example #1
    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                 environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
                 gateway=None, jsc=None, profiler_cls=BasicProfiler):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        :param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        :param appName: A name for your job, to display on the cluster web UI.
        :param sparkHome: Location where Spark is installed on cluster nodes.
        :param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        :param environment: A dictionary of environment variables to set on
               worker nodes.
        :param batchSize: The number of Python objects represented as a single
               Java object. Set 1 to disable batching, 0 to automatically choose
               the batch size based on object sizes, or -1 to use an unlimited
               batch size
        :param serializer: The serializer for RDDs.
        :param conf: A L{SparkConf} object setting Spark properties.
        :param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        :param jsc: The JavaSparkContext instance (optional).
        :param profiler_cls: A class of custom Profiler used to do profiling
               (default is pyspark.profiler.BasicProfiler).


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        self._callsite = first_spark_call() or CallSite(None, None, None)
        if gateway is not None and gateway.gateway_parameters.auth_token is None:
            allow_insecure_env = os.environ.get("PYSPARK_ALLOW_INSECURE_GATEWAY", "0")
            if allow_insecure_env == "1" or allow_insecure_env.lower() == "true":
                warnings.warn(
                    "You are passing in an insecure Py4j gateway.  This "
                    "presents a security risk, and will be completely forbidden in Spark 3.0")
            else:
                raise ValueError(
                    "You are trying to pass an insecure Py4j gateway to Spark. This"
                    " presents a security risk.  If you are sure you understand and accept this"
                    " risk, you can set the environment variable"
                    " 'PYSPARK_ALLOW_INSECURE_GATEWAY=1', but"
                    " note this option will be removed in Spark 3.0")

        SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
        try:
            self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                          conf, jsc, profiler_cls)
        except:
            # If an error occurs, clean up in order to allow future SparkContext creation:
            self.stop()
            raise
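As the docstring notes, master and appName can be supplied either as named parameters or through a SparkConf object. A minimal usage sketch of both routes, assuming a local PySpark installation (the app names and the local[2] master are placeholders):

from pyspark import SparkConf, SparkContext

# Route 1: carry the settings in a SparkConf object.
conf = SparkConf().setMaster("local[2]").setAppName("conf-example")
sc = SparkContext(conf=conf)
sc.stop()  # only one SparkContext may be active, so stop it before creating another

# Route 2: pass the same settings directly as constructor arguments.
sc = SparkContext("local[2]", "named-args-example")
sc.stop()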
Example #2
    def __init__(self,
                 master=None,
                 appName=None,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=0,
                 serializer=PickleSerializer(),
                 conf=None,
                 gateway=None,
                 jsc=None,
                 profiler_cls=BasicProfiler,
                 function_serializer=CloudPickleSerializer()):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        :param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        :param appName: A name for your job, to display on the cluster web UI.
        :param sparkHome: Location where Spark is installed on cluster nodes.
        :param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        :param environment: A dictionary of environment variables to set on
               worker nodes.
        :param batchSize: The number of Python objects represented as a single
               Java object. Set 1 to disable batching, 0 to automatically choose
               the batch size based on object sizes, or -1 to use an unlimited
               batch size
        :param serializer: The serializer for RDDs.
        :param conf: A L{SparkConf} object setting Spark properties.
        :param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        :param jsc: The JavaSparkContext instance (optional).
        :param profiler_cls: A class of custom Profiler used to do profiling
               (default is pyspark.profiler.BasicProfiler).
        :param function_serializer: The serializer for functions used in RDD
               transformations.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        self._callsite = first_spark_call() or CallSite(None, None, None)
        SparkContext._ensure_initialized(self, gateway=gateway)
        try:
            self._do_init(master, appName, sparkHome, pyFiles, environment,
                          batchSize, serializer, conf, jsc, profiler_cls,
                          function_serializer)
        except:
            # If an error occurs, clean up in order to allow future SparkContext creation:
            self.stop()
            raise
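This variant differs from Example #1 only in the extra function_serializer parameter, which selects the serializer for functions shipped to RDD transformations. A hedged sketch of how it might be passed; function_serializer exists only in this fork's signature (stock PySpark does not accept it), while the serializer classes themselves come from pyspark.serializers:

from pyspark import SparkContext
from pyspark.serializers import PickleSerializer, CloudPickleSerializer

# serializer handles RDD records; function_serializer (fork-specific, per the
# signature above) handles closures, which CloudPickle can serialize even when
# they are lambdas or nested functions.
sc = SparkContext(
    "local[2]", "custom-serializers",
    serializer=PickleSerializer(),
    function_serializer=CloudPickleSerializer(),
)
sc.stop()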
Example #3
    def __init__(
        self,
        master=None,
        appName=None,
        sparkHome=None,
        pyFiles=None,
        environment=None,
        batchSize=0,
        serializer=PickleSerializer(),
        conf=None,
        gateway=None,
        jsc=None,
        profiler_cls=BasicProfiler,
    ):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        :param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        :param appName: A name for your job, to display on the cluster web UI.
        :param sparkHome: Location where Spark is installed on cluster nodes.
        :param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        :param environment: A dictionary of environment variables to set on
               worker nodes.
        :param batchSize: The number of Python objects represented as a single
               Java object. Set 1 to disable batching, 0 to automatically choose
               the batch size based on object sizes, or -1 to use an unlimited
               batch size
        :param serializer: The serializer for RDDs.
        :param conf: A L{SparkConf} object setting Spark properties.
        :param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        :param jsc: The JavaSparkContext instance (optional).
        :param profiler_cls: A class of custom Profiler used to do profiling
               (default is pyspark.profiler.BasicProfiler).


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        self._callsite = first_spark_call() or CallSite(None, None, None)
        SparkContext._ensure_initialized(self, gateway=gateway)
        try:
            self._do_init(
                master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls
            )
        except:
            # If an error occurs, clean up in order to allow future SparkContext creation:
            self.stop()
            raise
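The batchSize and serializer parameters documented above can be tuned together. A small sketch, assuming the stock pyspark.serializers module; MarshalSerializer trades generality for speed compared with the default PickleSerializer:

from pyspark import SparkContext
from pyspark.serializers import MarshalSerializer

# batchSize=0 (the default) picks a batch size from object sizes,
# 1 disables batching, and -1 uses a single unlimited batch.
sc = SparkContext("local[2]", "batching-example",
                  batchSize=1,
                  serializer=MarshalSerializer())
print(sc.parallelize(range(10)).sum())  # 45
sc.stop()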
Example #4
    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
                 environment=None, batchSize=0, serializer=PickleSerializer(), conf=None,
                 gateway=None, jsc=None, profiler_cls=BasicProfiler):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        :param master: Cluster URL to connect to(连接到的集群URL)
               (e.g. mesos://host:port, spark://host:port, local[4]).
        :param appName: A name for your job, to display on the cluster web UI.
        :param sparkHome: Location where Spark is installed on cluster nodes.
        :param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        :param environment: A dictionary of environment variables to set on
               worker nodes.
        :param batchSize: The number of Python objects represented as a single
               Java object. Set 1 to disable batching, 0 to automatically choose
               the batch size based on object sizes, or -1 to use an unlimited
               batch size
        :param serializer: The serializer for RDDs.
        :param conf: A L{SparkConf} object setting Spark properties.
        :param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.
        :param jsc: The JavaSparkContext instance (optional).
        :param profiler_cls: A class of custom Profiler used to do profiling
               (default is pyspark.profiler.BasicProfiler).


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        self._callsite = first_spark_call() or CallSite(None, None, None)  # record the user call site for error reporting
        # Check whether a SparkContext has already been initialized; if one is already running, an exception is raised.
        SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
        try:
            self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                          conf, jsc, profiler_cls)
        except:
            # If an error occurs, clean up in order to allow future SparkContext creation:
            self.stop()
            raise
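The try/except cleanup at the end of each example exists so that a failed construction does not block later ones, and the ValueError shown in the doctest reflects the rule that only one SparkContext can be active per process. A short sketch of that behaviour, assuming local mode:

from pyspark import SparkContext

sc = SparkContext("local", "first")
try:
    SparkContext("local", "second")  # rejected: a context is already running
except ValueError as err:
    print("as expected:", err)
finally:
    sc.stop()  # after stop(), a new context may be created again

sc2 = SparkContext("local", "second")
sc2.stop()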