def textFile(self, name, minPartitions=None):
    """
    Read a text file from HDFS, a local file system (available on all
    nodes), or any Hadoop-supported file system URI, and return it as an
    RDD of Strings.

    >>> path = os.path.join(tempdir, "sample-text.txt")
    >>> with open(path, "w") as testFile:
    ...    testFile.write("Hello world!")
    >>> textFile = sc.textFile(path)
    >>> textFile.collect()
    [u'Hello world!']
    """
    minPartitions = minPartitions or min(self.defaultParallelism, 2)
    return RDD(self._jsc.textFile(name, minPartitions), self,
               UTF8Deserializer())
def binaryFiles(self, path, minPartitions=None):
    """
    .. note:: Experimental

    Read a directory of binary files from HDFS, a local file system
    (available on all nodes), or any Hadoop-supported file system URI
    as a byte array. Each file is read as a single record and returned
    in a key-value pair, where the key is the path of each file and the
    value is the content of each file.

    .. note:: Small files are preferred; large files are also allowed,
        but may cause poor performance.
    """
    minPartitions = minPartitions or self.defaultMinPartitions
    return RDD(self._jsc.binaryFiles(path, minPartitions), self,
               PairDeserializer(UTF8Deserializer(), NoOpSerializer()))
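A hedged usage sketch for binaryFiles, assuming an active SparkContext named sc; the directory path is hypothetical:

# Assumes an active SparkContext `sc`; the directory path is made up.
pairs = sc.binaryFiles("hdfs:///data/blobs")   # RDD of (file path, file contents as bytes)
sizes = pairs.mapValues(len)                   # (file path, size in bytes)
print(sizes.take(3))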
def socketTextStream(self, hostname, port, storageLevel=StorageLevel.MEMORY_AND_DISK_2):
    """
    Create an input from TCP source hostname:port. Data is received
    using a TCP socket and the received bytes are interpreted as
    UTF8-encoded, ``\\n``-delimited lines.

    @param hostname:      Hostname to connect to for receiving data
    @param port:          Port to connect to for receiving data
    @param storageLevel:  Storage level to use for storing the received objects
    """
    jlevel = self._sc._getJavaStorageLevel(storageLevel)
    return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self,
                   UTF8Deserializer())
def createStream(ssc, brokerUrl, topic, storageLevel=StorageLevel.MEMORY_AND_DISK_2):
    """
    Create an input stream that pulls messages from an MQTT broker.

    :param ssc:  StreamingContext object
    :param brokerUrl:  URL of the remote MQTT publisher
    :param topic:  topic name to subscribe to
    :param storageLevel:  RDD storage level.
    :return: A DStream object
    """
    jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
    helper = MQTTUtils._get_helper(ssc._sc)
    jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
    return DStream(jstream, ssc, UTF8Deserializer())
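A hedged usage sketch, assuming an active SparkContext sc and the MQTT connector jar on the classpath; the broker URL and topic name are hypothetical:

from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 1)  # 1-second batch interval; assumes `sc` exists
lines = MQTTUtils.createStream(ssc, "tcp://broker.example.com:1883",  # hypothetical broker
                               "sensors/temperature")                 # hypothetical topic
lines.pprint()
ssc.start()
ssc.awaitTermination()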
def _load_from_socket(port, auth_secret):
    """
    Load data from a given socket. This is a blocking method and returns
    only when the socket connection has been closed. It is copied from
    context.py, with a modified message protocol.
    """
    sock = None
    # Support for both IPv4 and IPv6.
    # On most IPv6-ready systems, IPv6 will take precedence.
    for res in socket.getaddrinfo("localhost", port, socket.AF_UNSPEC, socket.SOCK_STREAM):
        af, socktype, proto, canonname, sa = res
        sock = socket.socket(af, socktype, proto)
        try:
            # Do not allow timeout for socket reading operation.
            sock.settimeout(None)
            sock.connect(sa)
        except socket.error:
            sock.close()
            sock = None
            continue
        break
    if not sock:
        raise Exception("could not open socket")

    # We don't really need a socket file here; it's just for convenience so
    # that we can reuse the do_server_auth() function and data serialization
    # methods.
    sockfile = sock.makefile("rwb", 65536)

    # Make a barrier() function call.
    write_int(BARRIER_FUNCTION, sockfile)
    sockfile.flush()

    # Do server auth.
    do_server_auth(sockfile, auth_secret)

    # Collect result.
    res = UTF8Deserializer().loads(sockfile)

    # Release resources.
    sockfile.close()
    sock.close()

    return res
def binaryFiles(self, path, minPartitions=None):
    """
    .. note:: Experimental

    Read a directory of binary files from HDFS, a local file system
    (available on all nodes), or any Hadoop-supported file system URI
    as a byte array. Each file is read as a single record and returned
    in a key-value pair, where the key is the path of each file and the
    value is the content of each file.

    .. note:: Small files are preferred; large files are also allowed,
        but may cause poor performance.
    """
    minPartitions = minPartitions or self.defaultMinPartitions  # get the minimum number of partitions
    return RDD(self._jsc.binaryFiles(path, minPartitions), self,
               PairDeserializer(UTF8Deserializer(), NoOpSerializer()))
def _load_from_socket(port, auth_secret):
    """
    Load data from a given socket. This is a blocking method and returns
    only when the socket connection has been closed.
    """
    (sockfile, sock) = local_connect_and_auth(port, auth_secret)
    # The barrier() call may block forever, so no timeout
    sock.settimeout(None)
    # Make a barrier() function call.
    write_int(BARRIER_FUNCTION, sockfile)
    sockfile.flush()

    # Collect result.
    res = UTF8Deserializer().loads(sockfile)

    # Release resources.
    sockfile.close()
    sock.close()

    return res
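A minimal sketch of the wire format this client expects, written as a toy peer; it skips the authentication handshake that local_connect_and_auth performs, and toy_barrier_server / recv_exactly are hypothetical names introduced only for illustration:

import socket
import struct

def recv_exactly(conn, n):
    # socket.recv may return short reads, so loop until n bytes arrive
    data = b""
    while len(data) < n:
        chunk = conn.recv(n - len(data))
        if not chunk:
            raise EOFError("peer closed the connection")
        data += chunk
    return data

def toy_barrier_server(reply=u"ok"):
    # Accept one connection, read the 4-byte big-endian function code
    # (write_int's framing), then answer with a length-prefixed UTF-8
    # string (the framing UTF8Deserializer().loads(sockfile) expects).
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind(("127.0.0.1", 0))
    srv.listen(1)
    print("toy server listening on port", srv.getsockname()[1])
    conn, _ = srv.accept()
    function_code = struct.unpack(">i", recv_exactly(conn, 4))[0]
    payload = reply.encode("utf-8")
    conn.sendall(struct.pack(">i", len(payload)) + payload)
    conn.close()
    srv.close()
    return function_code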
def textFile(self, name, minPartitions=None, use_unicode=True):
    """
    Read a text file from HDFS, a local file system (available on all
    nodes), or any Hadoop-supported file system URI, and return it as an
    RDD of Strings.

    If use_unicode is False, the strings will be kept as `str` (encoding
    as `utf-8`), which is faster and smaller than unicode. (Added in
    Spark 1.2)

    >>> path = os.path.join(tempdir, "sample-text.txt")
    >>> with open(path, "w") as testFile:
    ...    _ = testFile.write("Hello world!")
    >>> textFile = sc.textFile(path)
    >>> textFile.collect()
    [u'Hello world!']
    """
    minPartitions = minPartitions or min(self.defaultParallelism, 2)
    return RDD(self._jsc.textFile(name, minPartitions), self,
               UTF8Deserializer(use_unicode))
def load_stream(self, stream):
    """
    Load a stream of un-ordered Arrow RecordBatches, where the last
    iteration yields a list of indices that can be used to put the
    RecordBatches in the correct order.
    """
    # load the batches
    for batch in self.serializer.load_stream(stream):
        yield batch

    # load the batch order indices or propagate any error that occurred in the JVM
    num = read_int(stream)
    if num == -1:
        error_msg = UTF8Deserializer().loads(stream)
        raise RuntimeError("An error occurred while calling "
                           "ArrowCollectSerializer.load_stream: {}".format(error_msg))
    batch_order = []
    for i in xrange(num):
        index = read_int(stream)
        batch_order.append(index)
    yield batch_order
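A hedged sketch of how a caller might consume this generator; ser and stream stand in for an ArrowCollectSerializer instance and the socket file, and every yielded item except the last is an Arrow RecordBatch:

# `ser` and `stream` are assumptions: a serializer instance and an open stream.
results = list(ser.load_stream(stream))               # batches, then the index list
batches, batch_order = results[:-1], results[-1]
ordered_batches = [batches[i] for i in batch_order]   # JVM arrival order -> partition order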
def createStream(ssc, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
    """
    Create an input stream that pulls messages from an Event Hub.

    :param ssc:  StreamingContext object
    :param storageLevel:  RDD storage level.
    :return: A DStream object
    """
    jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
    try:
        helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("com.ge.predix.predixinsights.eventhub.EventHubUtilsPythonHelper")
        helper = helperClass.newInstance()
        jstream = helper.createStream(ssc._jssc, jlevel)
    except Py4JJavaError as e:
        if 'ClassNotFoundException' in str(e.java_exception):
            EHUtils._printErrorMsg(ssc.sparkContext)
        raise e
    return DStream(jstream, ssc, UTF8Deserializer())
def createStream(ssc, brokerUrl, topic, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
    """
    Create an input stream that pulls messages from an MQTT broker.

    :param ssc:  StreamingContext object
    :param brokerUrl:  URL of the remote MQTT publisher
    :param topic:  topic name to subscribe to
    :param storageLevel:  RDD storage level.
    :return: A DStream object
    """
    # Use the caller-supplied storage level rather than hard-coding
    # MEMORY_AND_DISK_SER_2, so the parameter is actually honored.
    jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
    try:
        helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper")
        helper = helperClass.newInstance()
        jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
    except Py4JJavaError as e:
        if 'ClassNotFoundException' in str(e.java_exception):
            MQTTUtils._printErrorMsg(ssc.sparkContext)
        raise e
    return DStream(jstream, ssc, UTF8Deserializer())
def worker(sock, authenticated):
    """
    Called by a worker process after the fork().
    """
    signal.signal(SIGHUP, SIG_DFL)
    signal.signal(SIGCHLD, SIG_DFL)
    signal.signal(SIGTERM, SIG_DFL)
    # restore the handler for SIGINT,
    # it's useful for debugging (show the stacktrace before exit)
    signal.signal(SIGINT, signal.default_int_handler)

    # Read the socket using fdopen instead of socket.makefile() because the latter
    # seems to be very slow; note that we need to dup() the file descriptor because
    # otherwise writes also cause a seek that makes us miss data on the read side.
    buffer_size = int(os.environ.get("SPARK_BUFFER_SIZE", 65536))
    infile = os.fdopen(os.dup(sock.fileno()), "rb", buffer_size)
    outfile = os.fdopen(os.dup(sock.fileno()), "wb", buffer_size)

    if not authenticated:
        client_secret = UTF8Deserializer().loads(infile)
        if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret:
            write_with_length("ok".encode("utf-8"), outfile)
            outfile.flush()
        else:
            write_with_length("err".encode("utf-8"), outfile)
            outfile.flush()
            sock.close()
            return 1

    exit_code = 0
    try:
        worker_main(infile, outfile)
    except SystemExit as exc:
        exit_code = compute_real_exit_code(exc.code)
    finally:
        try:
            outfile.flush()
        except Exception:
            pass
    return exit_code
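A hedged sketch of the matching client side of the handshake above: the connecting process sends its secret length-prefixed and expects a length-prefixed "ok" back. toy_client_auth is a hypothetical name, and short reads on recv are not handled for brevity:

import struct

def toy_client_auth(sock, secret):
    payload = secret.encode("utf-8")
    sock.sendall(struct.pack(">i", len(payload)) + payload)  # write_with_length framing
    n = struct.unpack(">i", sock.recv(4))[0]                 # response length
    return sock.recv(n).decode("utf-8") == "ok"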
def textFile(self, name, minPartitions=None, use_unicode=True):
    """
    Read a text file from HDFS, a local file system (available on all
    nodes), or any Hadoop-supported file system URI, and return it as an
    RDD of Strings.

    If use_unicode is False, the strings will be kept as `str` (encoding
    as `utf-8`), which is faster and smaller than unicode. (Added in
    Spark 1.2)

    >>> path = os.path.join(tempdir, "sample-text.txt")
    >>> with open(path, "w") as testFile:
    ...    _ = testFile.write("Hello world!")
    >>> textFile = sc.textFile(path)
    >>> textFile.collect()
    [u'Hello world!']
    """
    minPartitions = minPartitions or min(self.defaultParallelism, 2)  # get the minimum number of partitions
    return RDD(self._jsc.textFile(name, minPartitions), self,
               UTF8Deserializer(use_unicode))
def createStreams(ssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                  loghubEndpoint, numReceivers, accessKeyId, accessKeySecret,
                  storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
    """
    :param ssc: StreamingContext object.
    :param logServiceProject: The name of `LogService` project.
    :param logStoreName: The name of logStore.
    :param loghubConsumerGroupName: The group name of loghub consumer. All
        consumer processes that share the same group name consume the
        specified logStore together.
    :param loghubEndpoint: The endpoint of loghub.
    :param numReceivers: The number of receivers.
    :param accessKeyId: Aliyun Access Key ID.
    :param accessKeySecret: Aliyun Access Key Secret.
    :param storageLevel: RDD storage level.
    :return: A DStream object.
    """
    try:
        helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
        helper = helperClass.newInstance()
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                      loghubConsumerGroupName, loghubEndpoint,
                                      numReceivers, accessKeyId, accessKeySecret,
                                      jlevel)
    except Py4JJavaError as e:
        # TODO: use --jar once it also works on the driver
        if 'ClassNotFoundException' in str(e.java_exception):
            LoghubUtils._printErrorMsg()
        raise e
    return DStream(jstream, ssc, UTF8Deserializer())
def _load_from_socket(
    port: Optional[Union[str, int]],
    auth_secret: str,
    function: int,
    all_gather_message: Optional[str] = None,
) -> List[str]:
    """
    Load data from a given socket. This is a blocking method and returns
    only when the socket connection has been closed.
    """
    (sockfile, sock) = local_connect_and_auth(port, auth_secret)

    # The call may block forever, so no timeout
    sock.settimeout(None)

    if function == BARRIER_FUNCTION:
        # Make a barrier() function call.
        write_int(function, sockfile)
    elif function == ALL_GATHER_FUNCTION:
        # Make an all_gather() function call.
        write_int(function, sockfile)
        write_with_length(cast(str, all_gather_message).encode("utf-8"), sockfile)
    else:
        raise ValueError("Unrecognized function type")
    sockfile.flush()

    # Collect result.
    length = read_int(sockfile)  # number of strings to read
    res = []
    for i in range(length):
        res.append(UTF8Deserializer().loads(sockfile))

    # Release resources.
    sockfile.close()
    sock.close()

    return res
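A hedged usage sketch of the public barrier APIs that funnel through this helper, assuming rdd is an existing RDD on a cluster that supports barrier scheduling (allGather requires Spark 3.0+):

from pyspark import BarrierTaskContext

def exchange(iterator):
    ctx = BarrierTaskContext.get()
    # Blocks until every task in the stage contributes, then returns all messages.
    peers = ctx.allGather(str(ctx.partitionId()))
    yield sorted(peers)

print(rdd.barrier().mapPartitions(exchange).first())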
def socketTextStream(
    self, hostname: str, port: int, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2
) -> "DStream[str]":
    """
    Create an input from TCP source hostname:port. Data is received using
    a TCP socket and the received bytes are interpreted as UTF8-encoded,
    ``\\n``-delimited lines.

    Parameters
    ----------
    hostname : str
        Hostname to connect to for receiving data
    port : int
        Port to connect to for receiving data
    storageLevel : :class:`pyspark.StorageLevel`, optional
        Storage level to use for storing the received objects
    """
    jlevel = self._sc._getJavaStorageLevel(storageLevel)
    return DStream(self._jssc.socketTextStream(hostname, port, jlevel), self,
                   UTF8Deserializer())
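A hedged usage sketch: a streaming word count over this source, assuming an active SparkContext sc and something feeding text on localhost:9999 (e.g. nc -lk 9999); the port is arbitrary:

from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 1)
lines = ssc.socketTextStream("localhost", 9999)
counts = (lines.flatMap(lambda line: line.split(" "))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))
counts.pprint()
ssc.start()
ssc.awaitTermination()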
def createPullingStreamAsRawBytes(ssc, queueName, accessKeyId, accessKeySecret, endpoint,
                                  storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
    """
    :param ssc: StreamingContext object.
    :param queueName: The name of MNS queue.
    :param accessKeyId: Aliyun Access Key ID.
    :param accessKeySecret: Aliyun Access Key Secret.
    :param endpoint: The endpoint of MNS service.
    :param storageLevel: RDD storage level.
    :return: A DStream object.
    """
    try:
        helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.aliyun.mns.MnsUtilsHelper")
        helper = helperClass.newInstance()
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jstream = helper.createPullingStreamAsRawBytes(ssc._jssc, queueName, accessKeyId,
                                                       accessKeySecret, endpoint, jlevel)
    except Py4JJavaError as e:
        # TODO: use --jar once it also works on the driver
        if 'ClassNotFoundException' in str(e.java_exception):
            MnsUtils._printErrorMsg()
        raise e
    return DStream(jstream, ssc, UTF8Deserializer())
def createStream(ssc, brokerUrl, topic, storageLevel=StorageLevel.MEMORY_AND_DISK_2):
    """
    Create an input stream that pulls messages from an MQTT broker.

    :param ssc:  StreamingContext object
    :param brokerUrl:  URL of the remote MQTT publisher
    :param topic:  topic name to subscribe to
    :param storageLevel:  RDD storage level.
    :return: A DStream object
    """
    try:
        helper = ssc._jvm.org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper()
    except TypeError as e:
        if str(e) == "'JavaPackage' object is not callable":
            MQTTUtils._printErrorMsg(ssc.sparkContext)
        raise

    jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
    jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
    return DStream(jstream, ssc, UTF8Deserializer())
def launch_gateway(conf=None):
    """
    Launch a JVM gateway.

    :param conf: spark configuration passed to spark-submit
    :return: a JavaGateway connected to the launched JVM
    """
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
        gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
    else:
        SPARK_HOME = _find_spark_home()
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        command = [os.path.join(SPARK_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ['--conf', '%s=%s' % (k, v)]
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        if os.environ.get("SPARK_TESTING"):
            submit_args = ' '.join(["--conf spark.ui.enabled=false", submit_args])
        command = command + shlex.split(submit_args)

        # Create a temporary directory where the gateway server should write the connection
        # information.
        conn_info_dir = tempfile.mkdtemp()
        try:
            fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
            os.close(fd)
            os.unlink(conn_info_file)

            env = dict(os.environ)
            env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

            # Launch the Java gateway.
            # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
            if not on_windows:
                # Don't send ctrl-c / SIGINT to the Java gateway:
                def preexec_func():
                    signal.signal(signal.SIGINT, signal.SIG_IGN)
                proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
            else:
                # preexec_fn not supported on Windows
                proc = Popen(command, stdin=PIPE, env=env)

            # Wait for the file to appear, or for the process to exit, whichever happens first.
            while not proc.poll() and not os.path.isfile(conn_info_file):
                time.sleep(0.1)

            if not os.path.isfile(conn_info_file):
                raise Exception("Java gateway process exited before sending its port number")

            with open(conn_info_file, "rb") as info:
                gateway_port = read_int(info)
                gateway_secret = UTF8Deserializer().loads(info)
        finally:
            shutil.rmtree(conn_info_dir)

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)])
            atexit.register(killChild)

    # Connect to the gateway
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port,
                                             auth_token=gateway_secret,
                                             auto_convert=True))

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.ml.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    # TODO(davies): move into sql
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
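A minimal sketch of the connection-info framing read above: a 4-byte big-endian port (what read_int consumes) followed by a length-prefixed UTF-8 secret (what UTF8Deserializer().loads consumes). pack_conn_info is a hypothetical helper and the values are made up:

import io
import struct

def pack_conn_info(port, secret):
    payload = secret.encode("utf-8")
    return struct.pack(">i", port) + struct.pack(">i", len(payload)) + payload

buf = io.BytesIO(pack_conn_info(25333, "s3cret"))
gateway_port = struct.unpack(">i", buf.read(4))[0]   # equivalent to read_int(info)
n = struct.unpack(">i", buf.read(4))[0]
gateway_secret = buf.read(n).decode("utf-8")         # equivalent to UTF8Deserializer().loads(info)
assert (gateway_port, gateway_secret) == (25333, "s3cret")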
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        version = utf8_deserializer.loads(infile)
        if version != "%d.%d" % sys.version_info[:2]:
            raise Exception(("Python in worker has different version %s than that in " +
                             "driver %s, PySpark cannot run with different minor versions. " +
                             "Please check environment variables PYSPARK_PYTHON and " +
                             "PYSPARK_DRIVER_PYTHON are correctly set.") %
                            ("%d.%d" % sys.version_info[:2], version))

        # read inputs only for a barrier task
        isBarrier = read_bool(infile)
        boundPort = read_int(infile)
        secret = UTF8Deserializer().loads(infile)

        # set up memory limits
        memory_limit_mb = int(os.environ.get('PYSPARK_EXECUTOR_MEMORY_MB', "-1"))
        if memory_limit_mb > 0 and has_resource_module:
            total_memory = resource.RLIMIT_AS
            try:
                (soft_limit, hard_limit) = resource.getrlimit(total_memory)
                msg = "Current mem limits: {0} of max {1}\n".format(soft_limit, hard_limit)
                print(msg, file=sys.stderr)

                # convert to bytes
                new_limit = memory_limit_mb * 1024 * 1024

                if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit:
                    msg = "Setting mem limits to {0} of max {1}\n".format(new_limit, new_limit)
                    print(msg, file=sys.stderr)
                    resource.setrlimit(total_memory, (new_limit, new_limit))

            except (resource.error, OSError, ValueError) as e:
                # not all systems support resource limits, so warn instead of failing
                print("WARN: Failed to set memory limit: {0}\n".format(e), file=sys.stderr)

        # initialize global state
        taskContext = None
        if isBarrier:
            taskContext = BarrierTaskContext._getOrCreate()
            BarrierTaskContext._initialize(boundPort, secret)
        else:
            taskContext = TaskContext._getOrCreate()
        # read inputs for TaskContext info
        taskContext._stageId = read_int(infile)
        taskContext._partitionId = read_int(infile)
        taskContext._attemptNumber = read_int(infile)
        taskContext._taskAttemptId = read_long(infile)
        taskContext._localProperties = dict()
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            taskContext._localProperties[k] = v

        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))
        if sys.version > '3':
            import importlib
            importlib.invalidate_caches()

        # fetch names and values of broadcast variables
        needs_broadcast_decryption_server = read_bool(infile)
        num_broadcast_variables = read_int(infile)
        if needs_broadcast_decryption_server:
            # read the decrypted data from a server in the jvm
            port = read_int(infile)
            auth_secret = utf8_deserializer.loads(infile)
            (broadcast_sock_file, _) = local_connect_and_auth(port, auth_secret)

        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                if needs_broadcast_decryption_server:
                    read_bid = read_long(broadcast_sock_file)
                    assert(read_bid == bid)
                    _broadcastRegistry[bid] = \
                        Broadcast(sock_file=broadcast_sock_file)
                else:
                    path = utf8_deserializer.loads(infile)
                    _broadcastRegistry[bid] = Broadcast(path=path)
            else:
                bid = - bid - 1
                _broadcastRegistry.pop(bid)

        if needs_broadcast_decryption_server:
            broadcast_sock_file.write(b'1')
            broadcast_sock_file.close()

        _accumulatorRegistry.clear()
        eval_type = read_int(infile)
        if eval_type == PythonEvalType.NON_UDF:
            func, profiler, deserializer, serializer = read_command(pickleSer, infile)
        else:
            func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)

        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            serializer.dump_stream(func(split_index, iterator), outfile)

        if profiler:
            profiler.profile(process)
        else:
            process()
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
        except IOError:
            # JVM close the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("PySpark worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)

    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)
from pyspark.java_gateway import local_connect_and_auth
from pyspark.taskcontext import BarrierTaskContext, TaskContext
from pyspark.files import SparkFiles
from pyspark.rdd import PythonEvalType
from pyspark.serializers import write_with_length, write_int, read_long, read_bool, \
    write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, \
    BatchedSerializer, ArrowStreamPandasSerializer
from pyspark.sql.types import to_arrow_type, StructType
from pyspark.util import _get_argspec, fail_on_stopiteration
from pyspark import shuffle

if sys.version >= '3':
    basestring = str

pickleSer = PickleSerializer()
utf8_deserializer = UTF8Deserializer()


def report_times(outfile, boot, init, finish):
    write_int(SpecialLengths.TIMING_DATA, outfile)
    write_long(int(1000 * boot), outfile)
    write_long(int(1000 * init), outfile)
    write_long(int(1000 * finish), outfile)


def add_path(path):
    # workers can be reused, so don't add the path multiple times
    if path not in sys.path:
        # overwrite system packages
        sys.path.insert(1, path)
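A hedged round-trip sketch of the length-prefixed framing that UTF8Deserializer shares with write_with_length, runnable against an in-memory buffer:

import io
from pyspark.serializers import UTF8Deserializer, write_with_length

buf = io.BytesIO()
write_with_length(u"hello".encode("utf-8"), buf)  # writes 4-byte length, then the payload
buf.seek(0)
assert UTF8Deserializer().loads(buf) == u"hello"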
def createStreams(ssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                  loghubEndpoint, numReceivers, accessKeyId, accessKeySecret,
                  storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
    '''
    :param ssc: StreamingContext object.
    :param logServiceProject: The name of the LogService project.
    :param logStoreName: The name of the logStore.
    :param loghubConsumerGroupName: The group name of loghub consumer.
    :param loghubEndpoint: The endpoint of loghub.
    :param numReceivers: The number of receivers.
    :param accessKeyId: Aliyun Access Key ID.
    :param accessKeySecret: Aliyun Access Key Secret.
    :param storageLevel: RDD storage level.
    :return: A DStream object.
    '''
    try:
        helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
        helper = helperClass.newInstance()
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                      loghubConsumerGroupName, loghubEndpoint,
                                      numReceivers, accessKeyId, accessKeySecret,
                                      jlevel)
    except Py4JJavaError as e:
        # TODO: use --jar once it also works on the driver
        if 'ClassNotFoundException' in str(e.java_exception):
            LoghubUtils._printErrorMsg()
        raise e
    return DStream(jstream, ssc, UTF8Deserializer())

@staticmethod
def createStream(ssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                 loghubEndpoint, accessKeyId, accessKeySecret,
                 storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
    """
    :param ssc: StreamingContext object.
    :param logServiceProject: The name of `LogService` project.
    :param logStoreName: The name of logStore.
    :param loghubConsumerGroupName: The group name of loghub consumer. All
        consumer processes that share the same group name consume the
        specified logStore together.
    :param loghubEndpoint: The endpoint of loghub.
    :param accessKeyId: Aliyun Access Key ID.
    :param accessKeySecret: Aliyun Access Key Secret.
    :param storageLevel: RDD storage level.
    :return: A DStream object.
    """
    try:
        helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
        helper = helperClass.newInstance()
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                      loghubConsumerGroupName, loghubEndpoint,
                                      accessKeyId, accessKeySecret, jlevel)
    except Py4JJavaError as e:
        # TODO: use --jar once it also works on the driver
        if 'ClassNotFoundException' in str(e.java_exception):
            LoghubUtils._printErrorMsg()
        raise e
    return DStream(jstream, ssc, UTF8Deserializer())

@staticmethod
def _printErrorMsg():
    print("""
________________________________________________________________________________________________

  E-MapReduce SDK's libraries not found in class path. Try one of the following.

  1. Include the 'emr-logservice_2.10' library and its dependencies in the
     spark-submit command as

     $ bin/spark-submit --packages com.aliyun.emr:emr-logservice_2.10:%s ...

  2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
     Group Id = com.aliyun.emr, Artifact Id = emr-logservice_2.10, Version = %s.
     Then, include the jar in the spark-submit command as

     $ bin/spark-submit --jars <emr-logservice_2.10-%s.jar> ...

________________________________________________________________________________________________
""" % ('1.4.2-SNAPSHOT', '1.4.2-SNAPSHOT', '1.4.2-SNAPSHOT'))
def launch_gateway(conf=None, popen_kwargs=None):
    """
    Launch a JVM gateway.

    Parameters
    ----------
    conf : :py:class:`pyspark.SparkConf`
        spark configuration passed to spark-submit
    popen_kwargs : dict
        Dictionary of kwargs to pass to Popen when spawning
        the py4j JVM. This is a developer feature intended for use in
        customizing how pyspark interacts with the py4j JVM (e.g., capturing
        stdout/stderr).

    Returns
    -------
    ClientServer or JavaGateway
    """
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
        gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
        # Process already exists
        proc = None
    else:
        SPARK_HOME = _find_spark_home()
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        command = [os.path.join(SPARK_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ["--conf", "%s=%s" % (k, v)]
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        if os.environ.get("SPARK_TESTING"):
            submit_args = " ".join(["--conf spark.ui.enabled=false", submit_args])
        command = command + shlex.split(submit_args)

        # Create a temporary directory where the gateway server should write the connection
        # information.
        conn_info_dir = tempfile.mkdtemp()
        try:
            fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
            os.close(fd)
            os.unlink(conn_info_file)

            env = dict(os.environ)
            env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

            # Launch the Java gateway.
            popen_kwargs = {} if popen_kwargs is None else popen_kwargs
            # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
            popen_kwargs["stdin"] = PIPE
            # We always set the necessary environment variables.
            popen_kwargs["env"] = env
            if not on_windows:
                # Don't send ctrl-c / SIGINT to the Java gateway:
                def preexec_func():
                    signal.signal(signal.SIGINT, signal.SIG_IGN)
                popen_kwargs["preexec_fn"] = preexec_func
                proc = Popen(command, **popen_kwargs)
            else:
                # preexec_fn not supported on Windows
                proc = Popen(command, **popen_kwargs)

            # Wait for the file to appear, or for the process to exit, whichever happens first.
            while not proc.poll() and not os.path.isfile(conn_info_file):
                time.sleep(0.1)

            if not os.path.isfile(conn_info_file):
                raise RuntimeError("Java gateway process exited before sending its port number")

            with open(conn_info_file, "rb") as info:
                gateway_port = read_int(info)
                gateway_secret = UTF8Deserializer().loads(info)
        finally:
            shutil.rmtree(conn_info_dir)

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)])
            atexit.register(killChild)

    # Connect to the gateway (or client server to pin the thread between JVM and Python)
    if os.environ.get("PYSPARK_PIN_THREAD", "true").lower() == "true":
        gateway = ClientServer(
            java_parameters=JavaParameters(port=gateway_port,
                                           auth_token=gateway_secret,
                                           auto_convert=True),
            python_parameters=PythonParameters(port=0, eager_load=False),
        )
    else:
        gateway = JavaGateway(
            gateway_parameters=GatewayParameters(port=gateway_port,
                                                 auth_token=gateway_secret,
                                                 auto_convert=True))

    # Store a reference to the Popen object for use by the caller (e.g., in reading stdout/stderr)
    gateway.proc = proc

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.ml.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.resource.*")
    # TODO(davies): move into sql
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
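A hedged usage sketch, launching the gateway directly (normally SparkContext does this internally) and touching a JVM class through the returned proxy:

from pyspark.java_gateway import launch_gateway

gw = launch_gateway()                        # spawns spark-submit and connects via py4j
jconf = gw.jvm.org.apache.spark.SparkConf()  # py4j proxy to a JVM SparkConf
print(jconf.toDebugString())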
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        version = utf8_deserializer.loads(infile)
        if version != "%d.%d" % sys.version_info[:2]:
            raise Exception(
                ("Python in worker has different version %s than that in " +
                 "driver %s, PySpark cannot run with different minor versions. " +
                 "Please check environment variables PYSPARK_PYTHON and " +
                 "PYSPARK_DRIVER_PYTHON are correctly set.") %
                ("%d.%d" % sys.version_info[:2], version))

        # read inputs only for a barrier task
        isBarrier = read_bool(infile)
        boundPort = read_int(infile)
        secret = UTF8Deserializer().loads(infile)

        # initialize global state
        taskContext = None
        if isBarrier:
            taskContext = BarrierTaskContext._getOrCreate()
            BarrierTaskContext._initialize(boundPort, secret)
        else:
            taskContext = TaskContext._getOrCreate()
        # read inputs for TaskContext info
        taskContext._stageId = read_int(infile)
        taskContext._partitionId = read_int(infile)
        taskContext._attemptNumber = read_int(infile)
        taskContext._taskAttemptId = read_long(infile)
        taskContext._localProperties = dict()
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            taskContext._localProperties[k] = v

        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))
        if sys.version > '3':
            import importlib
            importlib.invalidate_caches()

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                path = utf8_deserializer.loads(infile)
                _broadcastRegistry[bid] = Broadcast(path=path)
            else:
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        _accumulatorRegistry.clear()
        eval_type = read_int(infile)
        if eval_type == PythonEvalType.NON_UDF:
            func, profiler, deserializer, serializer = read_command(pickleSer, infile)
        else:
            func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)

        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            serializer.dump_stream(func(split_index, iterator), outfile)

        if profiler:
            profiler.profile(process)
        else:
            process()
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
        except IOError:
            # JVM close the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("PySpark worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)

    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)
def main(infile, outfile):
    faulthandler_log_path = os.environ.get("PYTHON_FAULTHANDLER_DIR", None)
    try:
        if faulthandler_log_path:
            faulthandler_log_path = os.path.join(faulthandler_log_path, str(os.getpid()))
            faulthandler_log_file = open(faulthandler_log_path, "w")
            faulthandler.enable(file=faulthandler_log_file)

        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        version = utf8_deserializer.loads(infile)
        if version != "%d.%d" % sys.version_info[:2]:
            raise RuntimeError(
                (
                    "Python in worker has different version %s than that in "
                    + "driver %s, PySpark cannot run with different minor versions. "
                    + "Please check environment variables PYSPARK_PYTHON and "
                    + "PYSPARK_DRIVER_PYTHON are correctly set."
                )
                % ("%d.%d" % sys.version_info[:2], version)
            )

        # read inputs only for a barrier task
        isBarrier = read_bool(infile)
        boundPort = read_int(infile)
        secret = UTF8Deserializer().loads(infile)

        # set up memory limits
        memory_limit_mb = int(os.environ.get("PYSPARK_EXECUTOR_MEMORY_MB", "-1"))
        if memory_limit_mb > 0 and has_resource_module:
            total_memory = resource.RLIMIT_AS
            try:
                (soft_limit, hard_limit) = resource.getrlimit(total_memory)
                msg = "Current mem limits: {0} of max {1}\n".format(soft_limit, hard_limit)
                print(msg, file=sys.stderr)

                # convert to bytes
                new_limit = memory_limit_mb * 1024 * 1024

                if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit:
                    msg = "Setting mem limits to {0} of max {1}\n".format(new_limit, new_limit)
                    print(msg, file=sys.stderr)
                    resource.setrlimit(total_memory, (new_limit, new_limit))

            except (resource.error, OSError, ValueError) as e:
                # not all systems support resource limits, so warn instead of failing
                lineno = (
                    getframeinfo(currentframe()).lineno + 1 if currentframe() is not None else 0
                )
                print(
                    warnings.formatwarning(
                        "Failed to set memory limit: {0}".format(e),
                        ResourceWarning,
                        __file__,
                        lineno,
                    ),
                    file=sys.stderr,
                )

        # initialize global state
        taskContext = None
        if isBarrier:
            taskContext = BarrierTaskContext._getOrCreate()
            BarrierTaskContext._initialize(boundPort, secret)
            # Set the task context instance here, so we can get it by TaskContext.get for
            # both TaskContext and BarrierTaskContext
            TaskContext._setTaskContext(taskContext)
        else:
            taskContext = TaskContext._getOrCreate()
        # read inputs for TaskContext info
        taskContext._stageId = read_int(infile)
        taskContext._partitionId = read_int(infile)
        taskContext._attemptNumber = read_int(infile)
        taskContext._taskAttemptId = read_long(infile)
        taskContext._cpus = read_int(infile)
        # reset the resources map once, before the loop, so every entry survives
        taskContext._resources = {}
        for r in range(read_int(infile)):
            key = utf8_deserializer.loads(infile)
            name = utf8_deserializer.loads(infile)
            addresses = []
            for a in range(read_int(infile)):
                addresses.append(utf8_deserializer.loads(infile))
            taskContext._resources[key] = ResourceInformation(name, addresses)

        taskContext._localProperties = dict()
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            taskContext._localProperties[k] = v

        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))
        importlib.invalidate_caches()

        # fetch names and values of broadcast variables
        needs_broadcast_decryption_server = read_bool(infile)
        num_broadcast_variables = read_int(infile)
        if needs_broadcast_decryption_server:
            # read the decrypted data from a server in the jvm
            port = read_int(infile)
            auth_secret = utf8_deserializer.loads(infile)
            (broadcast_sock_file, _) = local_connect_and_auth(port, auth_secret)

        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                if needs_broadcast_decryption_server:
                    read_bid = read_long(broadcast_sock_file)
                    assert read_bid == bid
                    _broadcastRegistry[bid] = Broadcast(sock_file=broadcast_sock_file)
                else:
                    path = utf8_deserializer.loads(infile)
                    _broadcastRegistry[bid] = Broadcast(path=path)
            else:
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        if needs_broadcast_decryption_server:
            broadcast_sock_file.write(b"1")
            broadcast_sock_file.close()

        _accumulatorRegistry.clear()
        eval_type = read_int(infile)
        if eval_type == PythonEvalType.NON_UDF:
            func, profiler, deserializer, serializer = read_command(pickleSer, infile)
        else:
            func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)

        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            out_iter = func(split_index, iterator)
            try:
                serializer.dump_stream(out_iter, outfile)
            finally:
                if hasattr(out_iter, "close"):
                    out_iter.close()

        if profiler:
            profiler.profile(process)
        else:
            process()

        # Reset task context to None. This is guard code to avoid residual context
        # when the worker is reused.
        TaskContext._setTaskContext(None)
        BarrierTaskContext._setTaskContext(None)
    except BaseException as e:
        try:
            exc_info = None
            if os.environ.get("SPARK_SIMPLIFIED_TRACEBACK", False):
                tb = try_simplify_traceback(sys.exc_info()[-1])
                if tb is not None:
                    e.__cause__ = None
                    exc_info = "".join(traceback.format_exception(type(e), e, tb))
            if exc_info is None:
                exc_info = traceback.format_exc()

            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(exc_info.encode("utf-8"), outfile)
        except IOError:
            # JVM close the socket
            pass
        except BaseException:
            # Write the error to stderr if it happened while serializing
            print("PySpark worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)
    finally:
        if faulthandler_log_path:
            faulthandler.disable()
            faulthandler_log_file.close()
            os.remove(faulthandler_log_path)

    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)
def test_null_in_rdd(self):
    jrdd = self.sc._jvm.PythonUtils.generateRDDWithNull(self.sc._jsc)
    rdd = RDD(jrdd, self.sc, UTF8Deserializer())
    self.assertEqual([u"a", None, u"b"], rdd.collect())
    rdd = RDD(jrdd, self.sc, NoOpSerializer())
    self.assertEqual([b"a", None, b"b"], rdd.collect())
def createStream(ssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                 numReceivers=None, loghubEndpoint=None, accessKeyId=None,
                 accessKeySecret=None, cursorPosition=None, mLoghubCursorStartTime=None,
                 forceSpecial=None, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
    """
    :param ssc: StreamingContext object.
    :param logServiceProject: The name of `LogService` project.
    :param logStoreName: The name of logStore.
    :param loghubConsumerGroupName: The group name of loghub consumer. All
        consumer processes that share the same group name consume the
        specified logStore together.
    :param loghubEndpoint: The endpoint of loghub.
    :param numReceivers: The number of receivers.
    :param accessKeyId: Aliyun Access Key ID.
    :param accessKeySecret: Aliyun Access Key Secret.
    :param cursorPosition: Set user defined cursor type.
    :param mLoghubCursorStartTime: Set user defined cursor position (Unix Timestamp).
    :param forceSpecial: Whether to force the consume position to be
        `mLoghubCursorStartTime`.
    :param storageLevel: RDD storage level.
    :return: A DStream object.
    """
    try:
        helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
            .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
        helper = helperClass.newInstance()
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        if numReceivers is None:
            if loghubEndpoint and accessKeySecret and accessKeyId:
                if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, loghubEndpoint,
                                                  accessKeyId, accessKeySecret, jlevel,
                                                  cursorPosition, mLoghubCursorStartTime,
                                                  forceSpecial)
                elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, loghubEndpoint,
                                                  accessKeyId, accessKeySecret, jlevel)
                else:
                    raise TypeError("cursorPosition, mLoghubCursorStartTime and forceSpecial "
                                    "must be provided together, or all omitted")
            elif not loghubEndpoint and not accessKeySecret and not accessKeyId:
                if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, jlevel,
                                                  cursorPosition, mLoghubCursorStartTime,
                                                  forceSpecial)
                elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, jlevel)
                else:
                    raise TypeError("cursorPosition, mLoghubCursorStartTime and forceSpecial "
                                    "must be provided together, or all omitted")
            else:
                raise TypeError("loghubEndpoint, accessKeySecret and accessKeyId must be "
                                "provided together, or all omitted")
        else:
            if loghubEndpoint and accessKeySecret and accessKeyId:
                if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, loghubEndpoint,
                                                  numReceivers, accessKeyId, accessKeySecret,
                                                  jlevel, cursorPosition,
                                                  mLoghubCursorStartTime, forceSpecial)
                elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, loghubEndpoint,
                                                  numReceivers, accessKeyId, accessKeySecret,
                                                  jlevel)
                else:
                    raise TypeError("cursorPosition, mLoghubCursorStartTime and forceSpecial "
                                    "must be provided together, or all omitted")
            elif not loghubEndpoint and not accessKeySecret and not accessKeyId:
                if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, numReceivers,
                                                  jlevel, cursorPosition,
                                                  mLoghubCursorStartTime, forceSpecial)
                elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                    jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                  loghubConsumerGroupName, numReceivers, jlevel)
                else:
                    raise TypeError("cursorPosition, mLoghubCursorStartTime and forceSpecial "
                                    "must be provided together, or all omitted")
            else:
                raise TypeError("loghubEndpoint, accessKeySecret and accessKeyId must be "
                                "provided together, or all omitted")
    except Py4JJavaError as e:
        # TODO: use --jar once it also works on the driver
        if 'ClassNotFoundException' in str(e.java_exception):
            LoghubUtils._printErrorMsg()
        raise e
    return DStream(jstream, ssc, UTF8Deserializer())