Example #1
    def subQuery(ct):
        serialized = example.Serialized()
        example.ReadSerializationFromFile(SparkFiles.get('cryptocontext.cc'), serialized)
        cryptoContext = example.CryptoContextFactory.DeserializeAndCreateContext(serialized, False)

        example.ReadSerializationFromFile(SparkFiles.get('query.enc'), serialized)
        queryCT = cryptoContext.deserializeCiphertext(serialized)

        example.StringToSerialization(ct, serialized)
        dfCT = cryptoContext.deserializeCiphertext(serialized)
        
        result = cryptoContext.EvalSub(dfCT, queryCT)

        """
        # Verify that one of them subs to 0
        example.ReadSerializationFromFile(SparkFiles.get('key.sec'), serialized)
        secKey = cryptoContext.deserializeSecretKey(serialized)
        plaintextDec = example.IntPlaintextEncoding([])
        decryptResult = cryptoContext.Decrypt(secKey, [result], plaintextDec, True)
        plaintextDec = example.decodeInts(plaintextDec)
        plaintextDec = plaintextDec[:decryptResult.messageLength]
        print(all(v == 0 for v in plaintextDec))
        """

        result.Serialize(serialized)
        return example.SerializationToString(serialized, '')[1]
Example #2
    def remove_spark_temp_files(self):
        logger.debug(
            f'Cleaning spark master temp dir {SparkFiles.getRootDirectory()}')
        rmtree(SparkFiles.getRootDirectory(), ignore_errors=True)

        temp_dir_rdd = self._spark_context.parallelize(
            range(self._spark_context.defaultParallelism)).map(
                lambda args: SparkFiles.getRootDirectory())
        logger.debug(
            f'Cleaning spark workers temp dirs: {set(temp_dir_rdd.collect())}')
        (temp_dir_rdd.map(
            lambda path: rmtree(path, ignore_errors=True)).collect())
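As a quick check of what the example above cleans up, here is a minimal sketch (assuming a local SparkContext is acceptable; all names are placeholders) that prints the SparkFiles root directory on the driver and on the workers:

    # Minimal sketch: inspect the SparkFiles root directory on driver and workers.
    from pyspark import SparkContext, SparkFiles

    sc = SparkContext("local[2]", "tempdir-demo")
    print("driver root:", SparkFiles.getRootDirectory())
    worker_roots = (sc.parallelize(range(sc.defaultParallelism))
                    .map(lambda _: SparkFiles.getRootDirectory())
                    .collect())
    print("worker roots:", set(worker_roots))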
Example #3
    def get_or_extract(archive_path):
        """Given a path returned by add_local_model(), this method will return a tuple of
        (loaded_model, local_model_path).
        If this Python process ever loaded the model before, we will reuse that copy.
        """
        from pyspark.files import SparkFiles
        import zipfile

        if archive_path in _SparkDirectoryDistributor._extracted_dir_paths:
            return _SparkDirectoryDistributor._extracted_dir_paths[
                archive_path]

        # BUG: Despite the documentation of SparkContext.addFile() and SparkFiles.get() in Scala
        # and Python, it turns out that we actually need to use the basename as the input to
        # SparkFiles.get(), as opposed to the (absolute) path.
        archive_path_basename = os.path.basename(archive_path)
        local_path = SparkFiles.get(archive_path_basename)
        temp_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(local_path, "r")
        zip_ref.extractall(temp_dir)
        zip_ref.close()

        _SparkDirectoryDistributor._extracted_dir_paths[
            archive_path] = temp_dir
        return _SparkDirectoryDistributor._extracted_dir_paths[archive_path]
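A self-contained sketch of the distribute-then-extract pattern above (the archive path and app name are hypothetical placeholders); note that only the basename is handed to SparkFiles.get(), as the BUG comment explains:

    import os
    import tempfile
    import zipfile

    from pyspark import SparkContext, SparkFiles

    sc = SparkContext("local[2]", "sparkfiles-archive-demo")
    archive_path = "/tmp/model.zip"  # hypothetical archive created elsewhere
    sc.addFile(archive_path)

    def extract_on_worker(_):
        # Pass only the basename to SparkFiles.get(), not the absolute path.
        local_path = SparkFiles.get(os.path.basename(archive_path))
        temp_dir = tempfile.mkdtemp()
        with zipfile.ZipFile(local_path, "r") as zip_ref:
            zip_ref.extractall(temp_dir)
        return temp_dir

    print(sc.parallelize(range(2)).map(extract_on_worker).collect())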
Example #4
 def test_add_file_locally(self):
     path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
     self.sc.addFile(path)
     download_path = SparkFiles.get("hello.txt")
     self.assertNotEqual(path, download_path)
     with open(download_path) as test_file:
         self.assertEquals("Hello World!\n", test_file.readline())
Example #6
    def mult(ct1, ct2):
        serialized = example.Serialized()
        example.ReadSerializationFromFile(SparkFiles.get('cryptocontext.cc'), serialized)
        cryptoContext = example.CryptoContextFactory.DeserializeAndCreateContext(serialized, False)

        ct1 = ct1.value
        example.StringToSerialization(ct1, serialized)
        ct1 = cryptoContext.deserializeCiphertext(serialized)

        ct2 = ct2.value
        example.StringToSerialization(ct2, serialized)
        ct2 = cryptoContext.deserializeCiphertext(serialized)

        result = cryptoContext.EvalMult(ct1, ct2)

        """
        # Verify that one of them mults to 0
        example.ReadSerializationFromFile(SparkFiles.get('key.sec'), serialized)
        secKey = cryptoContext.deserializeSecretKey(serialized)
        plaintextDec = example.IntPlaintextEncoding([])
        decryptResult = cryptoContext.Decrypt(secKey, [result], plaintextDec, True)
        plaintextDec = example.decodeInts(plaintextDec)
        plaintextDec = plaintextDec[:decryptResult.messageLength]
        print(all(v == 0 for v in plaintextDec))
        """

        result.Serialize(serialized)
        return Row(value=example.SerializationToString(serialized, '')[1])
Example #7
File: predict.py  Project: zhenglm/dlflow
        def predict_map(rdd):
            from pyspark.files import SparkFiles

            config = bc_config.value
            fmap = bc_fmap.value
            static_dir = SparkFiles.get(bc_static_model_dir.value)
            ckpt_dir = SparkFiles.get("ckpt")

            from dlflow.mgr import Collector, model
            collect = Collector()
            collect(static_dir, "Static models")

            input_cls = model[config.MODEL.input_name]
            dataset = input_cls(fmap).rdd_inputs(rdd, config.MODEL.batch_size)

            model_cls = model[config.MODEL.model_name]
            model_ins = model_cls(fmap)
            model_ins.load_model(ckpt_dir)

            return model_ins.predict_act(dataset)
Example #8
File: context.py  Project: dcobb/spark
    def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
        environment=None, batchSize=1024):
        """
        Create a new SparkContext.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param jobName: A name for your job, to display on the cluster web UI
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        """
        with SparkContext._lock:
            if SparkContext._active_spark_context:
                raise ValueError("Cannot run multiple SparkContexts at once")
            else:
                SparkContext._active_spark_context = self
        self.master = master
        self.jobName = jobName
        self.sparkHome = sparkHome or None # None becomes null in Py4J
        self.environment = environment or {}
        self.batchSize = batchSize  # -1 represents an unlimited batch size

        # Create the Java SparkContext through Py4J
        empty_string_array = self.gateway.new_array(self.jvm.String, 0)
        self._jsc = self.jvm.JavaSparkContext(master, jobName, sparkHome,
                                              empty_string_array)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
                self.jvm.java.util.ArrayList(),
                self.jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        # Deploy any code dependencies specified in the constructor
        for path in (pyFiles or []):
            self.addPyFile(path)
        SparkFiles._sc = self
        sys.path.append(SparkFiles.getRootDirectory())
Example #9
 def execute(self):
     """
     Overrides the execute method of the Reader class to read the file from Http/Https location.
     :return: pyspark.sql.DataFrame
                     Returns a Spark's DataFrame with the data from the specified file.
     """
     try:
         spark_context = self.spark.sparkContext
         spark_context.addFile(self.location)
         return self.spark.read.format(self.file_format) \
             .load(SparkFiles.get(self.location.split('/')[-1]))
     except AnalysisException:
         raise
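The same addFile-then-load pattern in a stand-alone sketch, with a placeholder HTTPS URL and CSV format (both are assumptions, not part of the example above):

    from pyspark import SparkFiles
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("http-reader-demo").getOrCreate()
    url = "https://example.com/data/people.csv"  # placeholder URL
    spark.sparkContext.addFile(url)
    df = (spark.read.format("csv")
          .option("header", "true")
          .load(SparkFiles.get(url.split("/")[-1])))
    df.show()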
Example #10
    def addPyFile(self, path):
        """
        Add a .py or .zip dependency for all tasks to be executed on this
        SparkContext in the future.  The C{path} passed can be either a local
        file, a file in HDFS (or other Hadoop-supported filesystems), or an
        HTTP, HTTPS or FTP URI.
        """
        self.addFile(path)
        (dirname, filename) = os.path.split(path) # dirname may be directory or HDFS/S3 prefix

        if filename.endswith('.zip') or filename.endswith('.ZIP') or filename.endswith('.egg'):
            self._python_includes.append(filename)
            sys.path.append(os.path.join(SparkFiles.getRootDirectory(), filename)) # for tests in local mode
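A hedged usage sketch of addPyFile with a hypothetical dependency archive; after the call, modules packaged inside the archive can be imported from functions that run on the executors:

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "addpyfile-demo")
    sc.addPyFile("/tmp/deps.zip")  # hypothetical archive containing mylib/__init__.py

    def use_dependency(x):
        import mylib               # importable on executors because of addPyFile
        return mylib.transform(x)  # hypothetical function provided by the archive

    print(sc.parallelize([1, 2, 3]).map(use_dependency).collect())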
Example #11
def _get_port():
    root_dir = SparkFiles.getRootDirectory()
    path = os.path.join(root_dir, "gateway_port")
    try:
        with open(path) as f:
            port = int(f.readline())
    except IOError as e:
        traceback.print_exc()
        raise RuntimeError("Could not open the file %s, which contains the listening port of"
                           " local Java Gateway, please make sure the init_executor_gateway()"
                           " function is called before any call of java function on the"
                           " executor side." % e.filename)
    return port
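A sketch of the driver-side setup this helper seems to expect (the port value and file layout are assumptions): write the gateway port to a file named gateway_port and ship it with addFile so it lands in SparkFiles.getRootDirectory() on every executor.

    import os
    import tempfile

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "gateway-port-demo")
    port_file = os.path.join(tempfile.mkdtemp(), "gateway_port")
    with open(port_file, "w") as f:
        f.write("25333\n")  # hypothetical listening port of the local Java gateway
    sc.addFile(port_file)   # executors can now read it from SparkFiles.getRootDirectory()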
Example #12
    def addPyFile(self, path):
        """
        Add a .py or .zip dependency for all tasks to be executed on this
        SparkContext in the future.  The C{path} passed can be either a local
        file, a file in HDFS (or other Hadoop-supported filesystems), or an
        HTTP, HTTPS or FTP URI.
        """
        self.addFile(path)
        (dirname, filename) = os.path.split(path)  # dirname may be directory or HDFS/S3 prefix

        if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
            self._python_includes.append(filename)
            # for tests in local mode
            sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))
Example #14
    def addPyFile(self, path):
        """
        Add a .py or .zip dependency for all tasks to be executed on this
        SparkContext in the future.  The C{path} passed can be either a local
        file, a file in HDFS (or other Hadoop-supported filesystems), or an
        HTTP, HTTPS or FTP URI.

        .. note:: A path can be added only once. Subsequent additions of the same path are ignored.
        """
        self.addFile(path)
        (dirname, filename) = os.path.split(path)  # dirname may be directory or HDFS/S3 prefix
        if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
            self._python_includes.append(filename)
            # for tests in local mode
            sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))
        if sys.version > '3':
            import importlib
            importlib.invalidate_caches()
Example #16
    def addPyFile(self, path):
        """
        Add a .py or .zip dependency for all tasks to be executed on this
        SparkContext in the future.  The C{path} passed can be either a local
        file, a file in HDFS (or other Hadoop-supported filesystems), or an
        HTTP, HTTPS or FTP URI.
        """
        self.addFile(path)
        (dirname, filename) = os.path.split(path)  # dirname may be directory or HDFS/S3 prefix
        if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:  # '.zip', '.egg' or '.jar'
            self._python_includes.append(filename)
            # for tests in local mode
            sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))
        if sys.version > '3':
            import importlib
            importlib.invalidate_caches()
Example #17
    def get_or_load(archive_path):
        """Given a path returned by add_local_model(), this method will return the loaded model.
        If this Python process ever loaded the model before, we will reuse that copy.
        """
        if archive_path in SparkModelCache._models:
            SparkModelCache._cache_hits += 1
            return SparkModelCache._models[archive_path]

        local_path = SparkFiles.get(archive_path)
        temp_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(local_path, 'r')
        zip_ref.extractall(temp_dir)
        zip_ref.close()

        # We must rely on a supposed cyclic import here because we want this behavior
        # on the Spark Executors (i.e., don't try to pickle the load_model function).
        from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import
        SparkModelCache._models[archive_path] = load_pyfunc(temp_dir)
        return SparkModelCache._models[archive_path]
Example #18
    def get_or_load(archive_path):
        """Given a path returned by add_local_model(), this method will return the loaded model.
        If this Python process ever loaded the model before, we will reuse that copy.
        """
        if archive_path in SparkModelCache._models:
            SparkModelCache._cache_hits += 1
            return SparkModelCache._models[archive_path]

        # BUG: Despite the documentation of SparkContext.addFile() and SparkFiles.get() in Scala
        # and Python, it turns out that we actually need to use the basename as the input to
        # SparkFiles.get(), as opposed to the (absolute) path.
        archive_path_basename = os.path.basename(archive_path)
        local_path = SparkFiles.get(archive_path_basename)
        temp_dir = tempfile.mkdtemp()
        zip_ref = zipfile.ZipFile(local_path, 'r')
        zip_ref.extractall(temp_dir)
        zip_ref.close()

        # We must rely on a supposed cyclic import here because we want this behavior
        # on the Spark Executors (i.e., don't try to pickle the load_model function).
        from mlflow.pyfunc import load_pyfunc  # pylint: disable=cyclic-import
        SparkModelCache._models[archive_path] = load_pyfunc(temp_dir)
        return SparkModelCache._models[archive_path]
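A hedged sketch of how such a cache is typically used from executor-side code, assuming the driver has already distributed the archive with sc.addFile() and that SparkModelCache and the loaded model's predict() behave as shown (both are assumptions):

    def score_partition(rows):
        # "model.zip" is the basename of an archive assumed to have been added via sc.addFile().
        model = SparkModelCache.get_or_load("model.zip")
        for row in rows:
            yield model.predict(row)  # hypothetical predict() on the loaded pyfunc model

    # predictions = data_rdd.mapPartitions(score_partition).collect()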
Example #19
File: context.py  Project: Ludwsam/spark
    def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                 conf, jsc):
        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 1:
            self.serializer = self._unbatched_serializer
        elif batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.iteritems():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)
        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename.lower().endswith("zip") or filename.lower().endswith("egg"):
                    self._python_includes.append(filename)
                    sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()

        # profiling stats collected for each PythonRDD
        self._profile_stats = []
Example #20
    def __init__(self, master, jobName, sparkHome=None, pyFiles=None,
        environment=None, batchSize=1024, serializer=PickleSerializer()):
        """
        Create a new SparkContext.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param jobName: A name for your job, to display on the cluster web UI
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        SparkContext._ensure_initialized(self)

        self.master = master
        self.jobName = jobName
        self.sparkHome = sparkHome or None # None becomes null in Py4J
        self.environment = environment or {}
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 1:
            self.serializer = self._unbatched_serializer
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Create the Java SparkContext through Py4J
        empty_string_array = self._gateway.new_array(self._jvm.String, 0)
        self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
                                              empty_string_array)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
                self._jvm.java.util.ArrayList(),
                self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.append(root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir()
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
Example #21
    def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                 conf, jsc, profiler_cls):
        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)
        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v
        if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
            # disable randomness of hash of string in worker, if this is not
            # launched by spark-submit
            self.environment["PYTHONHASHSEED"] = "0"

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                    self._python_includes.append(filename)
                    sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(profiler_cls, dump_path)
        else:
            self.profiler_collector = None
Example #22
    def __init__(self,
                 master=None,
                 appName=None,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=1024,
                 serializer=PickleSerializer(),
                 conf=None,
                 gateway=None):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param appName: A name for your job, to display on the cluster web UI.
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.
        @param conf: A L{SparkConf} object setting Spark properties.
        @param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        if rdd._extract_concise_traceback() is not None:
            self._callsite = rdd._extract_concise_traceback()
        else:
            tempNamedTuple = namedtuple("Callsite", "function file linenum")
            self._callsite = tempNamedTuple(function=None,
                                            file=None,
                                            linenum=None)
        SparkContext._ensure_initialized(self, gateway=gateway)

        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 1:
            self.serializer = self._unbatched_serializer
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.iteritems():
                self._conf.setExecutorEnv(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception(
                "An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)
        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        # Create the Java SparkContext through Py4J
        self._jsc = self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.append(root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(
            self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
Example #23
    def _do_init(self, master, appName, sparkHome, pyFiles, environment,
                 batchSize, serializer, conf, jsc, profiler_cls):
        self.environment = environment or {}
        # java gateway must have been launched at this point.
        if conf is not None and conf._jconf is not None:
            # conf has been initialized in JVM properly, so use conf directly. This represents the
            # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is
            # created and then stopped, and we create a new SparkConf and new SparkContext again)
            self._conf = conf
        else:
            self._conf = SparkConf(_jvm=SparkContext._jvm)
            if conf is not None:
                for k, v in conf.getAll():
                    self._conf.set(k, v)

        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception(
                "An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        self.environment["PYTHONHASHSEED"] = os.environ.get(
            "PYTHONHASHSEED", "0")

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)
        # Reset the SparkConf to the one actually used by the SparkContext in JVM.
        self._conf = SparkConf(_jconf=self._jsc.sc().conf())

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        auth_token = self._gateway.gateway_parameters.auth_token
        self._accumulatorServer = accumulators._start_update_server(auth_token)
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jvm.PythonAccumulatorV2(
            host, port, auth_token)
        self._jsc.sc().register(self._javaAccumulator)

        # If encryption is enabled, we need to setup a server in the jvm to read broadcast
        # data via a socket.
        # scala's mangled names w/ $ in them require special treatment.
        self._encryption_enabled = self._jvm.PythonUtils.isEncryptionEnabled(
            self._jsc)

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        self.pythonVer = "%d.%d" % sys.version_info[:2]

        if sys.version_info < (3, 0):
            with warnings.catch_warnings():
                warnings.simplefilter("once")
                warnings.warn(
                    "Support for Python 2 is deprecated as of Spark 3.0. "
                    "See the plan for dropping Python 2 support at "
                    "https://spark.apache.org/news/plan-for-dropping-python-2-support.html.",
                    DeprecationWarning)

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = BroadcastPickleRegistry()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                try:
                    filepath = os.path.join(SparkFiles.getRootDirectory(),
                                            filename)
                    if not os.path.exists(filepath):
                        # In case of YARN with shell mode, 'spark.submit.pyFiles' files are
                        # not added via SparkContext.addFile. Here we check if the file exists,
                        # try to copy and then add it to the path. See SPARK-21945.
                        shutil.copyfile(path, filepath)
                    if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                        self._python_includes.append(filename)
                        sys.path.insert(1, filepath)
                except Exception:
                    warnings.warn(
                        "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to "
                        "Python path:\n  %s" % (path, "\n  ".join(sys.path)),
                        RuntimeWarning)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(
            self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(
                profiler_cls, dump_path)
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
        def signal_handler(signal, frame):
            self.cancelAllJobs()
            raise KeyboardInterrupt()

        # see http://stackoverflow.com/questions/23206787/
        if isinstance(threading.current_thread(), threading._MainThread):
            signal.signal(signal.SIGINT, signal_handler)
Example #24
def _get_port():
    root_dir = SparkFiles.getRootDirectory()
    path = os.path.join(root_dir, "gateway_port")
    # Use a context manager so the file handle is closed after reading the port.
    with open(path) as f:
        port = int(f.readline())
    return port
Example #25
    def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                 conf, jsc, profiler_cls):
        self.environment = environment or {}  # environment variables to set on worker nodes
        # java gateway must have been launched at this point.
        if conf is not None and conf._jconf is not None:
            # conf has been initialized in JVM properly, so use conf directly. This represents the
            # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is
            # created and then stopped, and we create a new SparkConf and new SparkContext again)
            self._conf = conf
        else:
            self._conf = SparkConf(_jvm=SparkContext._jvm)
            if conf is not None:
                for k, v in conf.getAll():
                    self._conf.set(k, v)

        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer  # serializer without batching
        if batchSize == 0:  # 0 means the batch size is chosen automatically
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)  # set the master URL
        if appName:
            self._conf.setAppName(appName)  # set the application name
        if sparkHome:
            self._conf.setSparkHome(sparkHome)  # set the Spark home on cluster nodes
        if environment:  # executor environment variables
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters;
        # master and appName are the minimum required configuration.
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)
        # Reset the SparkConf to the one actually used by the SparkContext in JVM.
        self._conf = SparkConf(_jconf=self._jsc.sc().conf())

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port)
        self._jsc.sc().register(self._javaAccumulator)

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')  # Python executable used by workers
        self.pythonVer = "%d.%d" % sys.version_info[:2]  # Python version as "major.minor"

        if sys.version_info < (2, 7):  # Spark 2.0.0 deprecated Python 2.6
            warnings.warn("Support for Python 2.6 is deprecated as of Spark 2.0.0")

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = BroadcastPickleRegistry()  # thread-local registry of pickled broadcasts

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)  # make files downloaded via SparkFiles importable right away

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            # addPyFile adds a .py or .zip dependency for every future task; path may be a local
            # file, a file in HDFS (or another Hadoop-supported filesystem), or an HTTP(S)/FTP URI
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):  # comma-separated list
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:  # '.zip', '.egg' or '.jar'
                    self._python_includes.append(filename)
                    sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))  # put the downloaded file near the front of sys.path

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(profiler_cls, dump_path)  # tracks profilers per stage
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
        def signal_handler(signal, frame):
            self.cancelAllJobs()  # cancel all scheduled or running jobs
            raise KeyboardInterrupt()

        # see http://stackoverflow.com/questions/23206787/
        if isinstance(threading.current_thread(), threading._MainThread):
            signal.signal(signal.SIGINT, signal_handler)  # register the SIGINT handler: signal.signal(signalnum, handler)
Example #26
    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
        environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
        gateway=None):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param appName: A name for your job, to display on the cluster web UI.
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.
        @param conf: A L{SparkConf} object setting Spark properties.
        @param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instantiated.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        if rdd._extract_concise_traceback() is not None:
            self._callsite = rdd._extract_concise_traceback()
        else:
            tempNamedTuple = namedtuple("Callsite", "function file linenum")
            self._callsite = tempNamedTuple(function=None, file=None, linenum=None)
        SparkContext._ensure_initialized(self, gateway=gateway)

        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 1:
            self.serializer = self._unbatched_serializer
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.iteritems():
                self._conf.setExecutorEnv(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)
        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        # Create the Java SparkContext through Py4J
        self._jsc = self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
                self._jvm.java.util.ArrayList(),
                self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.append(root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
Example #27
    def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                 conf, jsc, profiler_cls):
        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        # Let YARN know it's a pyspark app, so it distributes needed libraries.
        if self.master == "yarn-client":
            self._conf.set("spark.yarn.isPython", "true")

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v
        if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
            # disable randomness of hash of string in worker, if this is not
            # launched by spark-submit
            self.environment["PYTHONHASHSEED"] = "0"

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        self.pythonVer = "%d.%d" % sys.version_info[:2]

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                    self._python_includes.append(filename)
                    sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(profiler_cls, dump_path)
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
        def signal_handler(signal, frame):
            self.cancelAllJobs()

        signal.signal(signal.SIGINT, signal_handler)
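The profiler wiring at the end of _do_init stays dormant unless spark.python.profile is enabled. A short sketch of driving it from user code, using only the conf keys read above and the public show_profiles() accessor:

# Sketch: enable per-RDD Python profiling through the conf key read in _do_init.
from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setMaster("local[2]")
        .setAppName("profile-demo")
        .set("spark.python.profile", "true"))
sc = SparkContext(conf=conf)

sc.parallelize(range(1000)).map(lambda x: x * x).count()
sc.show_profiles()   # prints the cProfile stats gathered by the ProfilerCollector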
Example #28
import glob
import hashlib
import time

import example  # project-specific crypto/serialization wrapper (Serialized, ReadSerializationFromFile, ...)
from pyspark.sql import SparkSession
from pyspark.files import SparkFiles

if __name__ == '__main__':
    '''
        Usage: xnor_spark
    '''

    spark = SparkSession\
            .builder\
            .appName('xnor_spark')\
            .getOrCreate()

    spark.sparkContext.setLogLevel('ERROR')

    serialized = example.Serialized()
    print(SparkFiles.get('key.pub'))
    example.ReadSerializationFromFile(SparkFiles.get('key.pub'), serialized)
    m = hashlib.sha1()
    pubKey = example.SerializationToString(serialized, '')[1]
    m.update(pubKey.encode('utf-8'))
    dirName = m.hexdigest()[:3]
    print(dirName)

    # The "database"
    start = time.time()
    print('Reading files...')
    df = spark.read.text(glob.glob('./'+dirName+'/server/*.enc'))
    end = time.time()
    print('...took', end - start)
    df.show()
    print(df.rdd.getNumPartitions(), "partitions")
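SparkFiles.get('key.pub') only resolves if the file was registered with addFile beforehand, either via spark-submit --files or programmatically. A hedged sketch of the driver-side setup this script appears to assume (the local paths are placeholders):

# Assumed setup (placeholder paths): register the serialized crypto artifacts so
# SparkFiles.get() can resolve them on the driver and on every executor.
spark.sparkContext.addFile('./keys/key.pub')
spark.sparkContext.addFile('./keys/cryptocontext.cc')
spark.sparkContext.addFile('./keys/query.enc')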
Example #29
    def __init__(self,
                 master,
                 jobName,
                 sparkHome=None,
                 pyFiles=None,
                 environment=None,
                 batchSize=1024):
        """
        Create a new SparkContext.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param jobName: A name for your job, to display on the cluster web UI
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        """
        with SparkContext._lock:
            if SparkContext._active_spark_context:
                raise ValueError("Cannot run multiple SparkContexts at once")
            else:
                SparkContext._active_spark_context = self
                if not SparkContext._gateway:
                    SparkContext._gateway = launch_gateway()
                    SparkContext._jvm = SparkContext._gateway.jvm
                    SparkContext._writeIteratorToPickleFile = \
                        SparkContext._jvm.PythonRDD.writeIteratorToPickleFile
                    SparkContext._takePartition = \
                        SparkContext._jvm.PythonRDD.takePartition
        self.master = master
        self.jobName = jobName
        self.sparkHome = sparkHome or None  # None becomes null in Py4J
        self.environment = environment or {}
        self.batchSize = batchSize  # -1 represents an unlimited batch size

        # Create the Java SparkContext through Py4J
        empty_string_array = self._gateway.new_array(self._jvm.String, 0)
        self._jsc = self._jvm.JavaSparkContext(master, jobName, sparkHome,
                                               empty_string_array)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        # Deploy any code dependencies specified in the constructor
        for path in (pyFiles or []):
            self.addPyFile(path)
        SparkFiles._sc = self
        sys.path.append(SparkFiles.getRootDirectory())

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.spark.Utils.getLocalDir()
        self._temp_dir = \
            self._jvm.spark.Utils.createTempDir(local_dir).getAbsolutePath()
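For reference, a usage sketch that matches the docstring of this legacy constructor; the master URL, job name, and archive names are placeholders:

# Sketch of the legacy constructor described in the docstring above.
sc = SparkContext(
    master="local[4]",                    # cluster URL
    jobName="legacy-example",             # shown on the cluster web UI
    pyFiles=["deps.zip", "helpers.py"],   # shipped to workers and added to PYTHONPATH
    batchSize=1024,                       # Python objects per Java object (-1 = unlimited)
)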
Example #30
    def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                 conf, jsc, profiler_cls):
        self.environment = environment or {}
        # java gateway must have been launched at this point.
        if conf is not None and conf._jconf is not None:
            # conf has been initialized in the JVM properly, so use conf directly. This represents
            # the scenario where the JVM was launched before this SparkConf was created (e.g. a
            # SparkContext was created and then stopped, and we create a new SparkConf and a new
            # SparkContext again).
            self._conf = conf
        else:
            self._conf = SparkConf(_jvm=SparkContext._jvm)

        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v
        if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
            # disable hash randomization for strings in the workers, if this app
            # was not launched by spark-submit
            self.environment["PYTHONHASHSEED"] = "0"

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)
        # Reset the SparkConf to the one actually used by the SparkContext in JVM.
        self._conf = SparkConf(_jconf=self._jsc.sc().conf())

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port)
        self._jsc.sc().register(self._javaAccumulator)

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        self.pythonVer = "%d.%d" % sys.version_info[:2]

        if sys.version_info < (2, 7):
            warnings.warn("Support for Python 2.6 is deprecated as of Spark 2.0.0")

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                    self._python_includes.append(filename)
                    sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(profiler_cls, dump_path)
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
        def signal_handler(signal, frame):
            self.cancelAllJobs()
            raise KeyboardInterrupt()

        # see http://stackoverflow.com/questions/23206787/
        if isinstance(threading.current_thread(), threading._MainThread):
            signal.signal(signal.SIGINT, signal_handler)
Example #31
from pathlib import Path
from pyspark.files import SparkFiles

def get_file_path(name):
    return Path(SparkFiles.get(name))
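A usage sketch for this helper, assuming the file was first distributed with SparkContext.addFile and that sc is an active SparkContext (the file name is a placeholder):

# Sketch: resolve a distributed file as a pathlib.Path.
sc.addFile("config.yaml")                  # placeholder file name
cfg_path = get_file_path("config.yaml")    # Path(SparkFiles.get("config.yaml"))
print(cfg_path.read_text())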
Example #32
File: context.py  Project: AllenShi/spark
    def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                 conf, jsc, profiler_cls):
        self.environment = environment or {}
        # java gateway must have been launched at this point.
        if conf is not None and conf._jconf is not None:
            # conf has been initialized in the JVM properly, so use conf directly. This represents
            # the scenario where the JVM was launched before this SparkConf was created (e.g. a
            # SparkContext was created and then stopped, and we create a new SparkConf and a new
            # SparkContext again).
            self._conf = conf
        else:
            self._conf = SparkConf(_jvm=SparkContext._jvm)
            if conf is not None:
                for k, v in conf.getAll():
                    self._conf.set(k, v)

        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)
        # Reset the SparkConf to the one actually used by the SparkContext in JVM.
        self._conf = SparkConf(_jconf=self._jsc.sc().conf())

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port)
        self._jsc.sc().register(self._javaAccumulator)

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        self.pythonVer = "%d.%d" % sys.version_info[:2]

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = BroadcastPickleRegistry()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                    self._python_includes.append(filename)
                    sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(profiler_cls, dump_path)
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
        def signal_handler(signal, frame):
            self.cancelAllJobs()
            raise KeyboardInterrupt()

        # see http://stackoverflow.com/questions/23206787/
        if isinstance(threading.current_thread(), threading._MainThread):
            signal.signal(signal.SIGINT, signal_handler)
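The threading._MainThread check exists because CPython only allows signal handlers to be installed from the main thread; calling signal.signal() anywhere else raises ValueError. A minimal sketch of the failure mode the guard avoids:

# Sketch: why the SIGINT handler is only registered on the main thread.
import signal
import threading

def try_install():
    try:
        signal.signal(signal.SIGINT, signal.default_int_handler)
    except ValueError as exc:
        print("cannot install handler off the main thread:", exc)

threading.Thread(target=try_install).start()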
from pyspark.sql import SparkSession
from pyspark.files import SparkFiles
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from pyspark.sql.types import StringType, StructType, ArrayType, StructField, Row
from collections import OrderedDict
import nltk

# word_tokenize() requires the 'punkt' tokenizer models; download them non-interactively
nltk.download('punkt')

spark = SparkSession \
    .builder \
    .appName("Python NLTK example") \
    .getOrCreate()

classifier_path = SparkFiles.get("english.all.3class.distsim.crf.ser.gz")
ner_jar = SparkFiles.get("stanford-ner.jar")
st = StanfordNERTagger(classifier_path, ner_jar, encoding='utf-8')


def classify_text(row):
    text = row["body_t"]
    # Collapse array to single value field
    all_text = ' '.join(text)
    tokenized_text = word_tokenize(all_text)
    classified_text = st.tag(tokenized_text)
    classifiers_dict = dict()
    for word, clz in classified_text:
        if clz in classifiers_dict:
            classifiers_dict[clz].add(word)
        else: