Example No. 1
 def textFile(self, name, minPartitions=None):
     """
     Read a text file from HDFS, a local file system (available on all
     nodes), or any Hadoop-supported file system URI, and return it as an
     RDD of Strings.
     
     >>> path = os.path.join(tempdir, "sample-text.txt")
     >>> with open(path, "w") as testFile:
     ...    testFile.write("Hello world!")
     >>> textFile = sc.textFile(path)
     >>> textFile.collect()
     [u'Hello world!']
     """
     minPartitions = minPartitions or min(self.defaultParallelism, 2)
     return RDD(self._jsc.textFile(name, minPartitions), self,
                UTF8Deserializer())
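
A minimal usage sketch for the method above; it assumes an already running SparkContext named `sc` and a small local file at a hypothetical path.

    # Hypothetical usage sketch: read a local text file into an RDD of strings
    # (assumes `sc` is an existing SparkContext and that the file exists).
    lines = sc.textFile("/tmp/sample-text.txt", minPartitions=2)
    print(lines.count())   # number of lines in the file
    print(lines.take(3))   # first (up to) three lines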
Example No. 2
    def binaryFiles(self, path, minPartitions=None):
        """
        .. note:: Experimental

        Read a directory of binary files from HDFS, a local file system
        (available on all nodes), or any Hadoop-supported file system URI
        as a byte array. Each file is read as a single record and returned
        in a key-value pair, where the key is the path of each file, the
        value is the content of each file.

        .. note:: Small files are preferred; large files are also allowed, but
            they may cause poor performance.
        """
        minPartitions = minPartitions or self.defaultMinPartitions
        return RDD(self._jsc.binaryFiles(path, minPartitions), self,
                   PairDeserializer(UTF8Deserializer(), NoOpSerializer()))
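
A short usage sketch for binaryFiles, assuming an existing SparkContext `sc`; the directory path is hypothetical. Each element of the resulting RDD is a (file path, file content as bytes) pair.

    # Hypothetical usage sketch (assumes an existing SparkContext `sc`).
    files = sc.binaryFiles("hdfs:///data/blobs")      # path is an assumption
    sizes = files.mapValues(len)                      # (path, content) -> (path, byte length)
    for path, size in sizes.collect():
        print(path, size)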
Example No. 3
    def socketTextStream(self,
                         hostname,
                         port,
                         storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Create an input stream from a TCP source hostname:port. Data is received
        using a TCP socket, and the received bytes are interpreted as UTF-8 encoded,
        ``\\n``-delimited lines.

        @param hostname:      Hostname to connect to for receiving data
        @param port:          Port to connect to for receiving data
        @param storageLevel:  Storage level to use for storing the received objects
        """
        jlevel = self._sc._getJavaStorageLevel(storageLevel)
        return DStream(self._jssc.socketTextStream(hostname, port, jlevel),
                       self, UTF8Deserializer())
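
A usage sketch for socketTextStream, assuming a StreamingContext `ssc` already exists and a line-oriented text server is listening on localhost:9999 (for example `nc -lk 9999`).

    # Hypothetical usage sketch: word counts over a TCP text stream.
    lines = ssc.socketTextStream("localhost", 9999)
    counts = (lines.flatMap(lambda line: line.split(" "))
                   .map(lambda word: (word, 1))
                   .reduceByKey(lambda a, b: a + b))
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()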
Example No. 4
    def createStream(ssc,
                     brokerUrl,
                     topic,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Create an input stream that pulls messages from an MQTT broker.

        :param ssc:  StreamingContext object
        :param brokerUrl:  URL of the remote MQTT publisher
        :param topic:  topic name to subscribe to
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        helper = MQTTUtils._get_helper(ssc._sc)
        jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
        return DStream(jstream, ssc, UTF8Deserializer())
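
A hedged usage sketch for the MQTT helper above; it assumes the external spark-streaming-mqtt package is on the classpath (the Python module path can differ between Spark and Bahir releases) and that a broker is reachable at the hypothetical URL below.

    # Hypothetical usage sketch (module path and broker URL are assumptions).
    from pyspark.streaming import StreamingContext
    from pyspark.streaming.mqtt import MQTTUtils   # external spark-streaming-mqtt package

    ssc = StreamingContext(sc, 1)                  # assumes an existing SparkContext `sc`
    stream = MQTTUtils.createStream(ssc, "tcp://localhost:1883", "sensors/temperature")
    stream.pprint()
    ssc.start()
    ssc.awaitTermination()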
Example No. 5
def _load_from_socket(port, auth_secret):
    """
    Load data from a given socket; this is a blocking method, so it only returns
    when the socket connection has been closed.

    This is copied from context.py, with the message protocol modified.
    """
    sock = None
    # Support for both IPv4 and IPv6.
    # On most of IPv6-ready systems, IPv6 will take precedence.
    for res in socket.getaddrinfo("localhost", port, socket.AF_UNSPEC,
                                  socket.SOCK_STREAM):
        af, socktype, proto, canonname, sa = res
        sock = socket.socket(af, socktype, proto)
        try:
            # Do not allow timeout for socket reading operation.
            sock.settimeout(None)
            sock.connect(sa)
        except socket.error:
            sock.close()
            sock = None
            continue
        break
    if not sock:
        raise Exception("could not open socket")

    # We don't really need a socket file here, it's just for convenience that we can reuse the
    # do_server_auth() function and data serialization methods.
    sockfile = sock.makefile("rwb", 65536)

    # Make a barrier() function call.
    write_int(BARRIER_FUNCTION, sockfile)
    sockfile.flush()

    # Do server auth.
    do_server_auth(sockfile, auth_secret)

    # Collect result.
    res = UTF8Deserializer().loads(sockfile)

    # Release resources.
    sockfile.close()
    sock.close()

    return res
Example No. 6
    def binaryFiles(self, path, minPartitions=None):
        """
        .. note:: Experimental

        Read a directory of binary files from HDFS, a local file system
        (available on all nodes), or any Hadoop-supported file system URI
        as a byte array. Each file is read as a single record and returned
        in a key-value pair, where the key is the path of each file, the
        value is the content of each file.
        Read a directory of binary files from HDFS, the local file system (available on all nodes),
        or any Hadoop-supported file system URI. Each file is read as a single record and returned
        as a key-value pair, where the key is the path of the file and the value is its content.

        .. note:: Small files are preferred; large files are also allowed, but
            they may cause poor performance.
        """
        minPartitions = minPartitions or self.defaultMinPartitions  # get the minimum number of partitions
        return RDD(self._jsc.binaryFiles(path, minPartitions), self,
                   PairDeserializer(UTF8Deserializer(), NoOpSerializer()))
Example No. 7
def _load_from_socket(port, auth_secret):
    """
    Load data from a given socket; this is a blocking method, so it only returns
    when the socket connection has been closed.
    """
    (sockfile, sock) = local_connect_and_auth(port, auth_secret)
    # The barrier() call may block forever, so no timeout
    sock.settimeout(None)
    # Make a barrier() function call.
    write_int(BARRIER_FUNCTION, sockfile)
    sockfile.flush()

    # Collect result.
    res = UTF8Deserializer().loads(sockfile)

    # Release resources.
    sockfile.close()
    sock.close()

    return res
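
Examples 5 and 7 both read the reply with UTF8Deserializer().loads(sockfile), which relies on PySpark's simple wire format: a 4-byte big-endian length followed by the UTF-8 payload. Below is a self-contained sketch of that framing using only the standard library; it is an illustration, not the actual pyspark.serializers implementation.

    import io
    import struct

    def write_utf8(value, stream):
        # length-prefixed UTF-8 framing, mirroring write_with_length/write_int
        data = value.encode("utf-8")
        stream.write(struct.pack("!i", len(data)))   # 4-byte big-endian length
        stream.write(data)

    def read_utf8(stream):
        # the counterpart read, as a UTF8Deserializer-style loads() would do it
        (length,) = struct.unpack("!i", stream.read(4))
        return stream.read(length).decode("utf-8")

    buf = io.BytesIO()
    write_utf8("Hello world!", buf)
    buf.seek(0)
    assert read_utf8(buf) == "Hello world!"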
Example No. 8
    def textFile(self, name, minPartitions=None, use_unicode=True):
        """
        Read a text file from HDFS, a local file system (available on all
        nodes), or any Hadoop-supported file system URI, and return it as an
        RDD of Strings.

        If use_unicode is False, the strings will be kept as `str` (encoding
        as `utf-8`), which is faster and smaller than unicode. (Added in
        Spark 1.2)

        >>> path = os.path.join(tempdir, "sample-text.txt")
        >>> with open(path, "w") as testFile:
        ...    _ = testFile.write("Hello world!")
        >>> textFile = sc.textFile(path)
        >>> textFile.collect()
        [u'Hello world!']
        """
        minPartitions = minPartitions or min(self.defaultParallelism, 2)
        return RDD(self._jsc.textFile(name, minPartitions), self,
                   UTF8Deserializer(use_unicode))
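
A brief usage sketch of the use_unicode flag, assuming an existing SparkContext `sc` and a hypothetical file path; with use_unicode=False the elements come back undecoded (str on Python 2, bytes on Python 3), so decoding can be deferred to where it is actually needed.

    # Hypothetical sketch: skip eager unicode decoding for speed.
    raw_lines = sc.textFile("/tmp/sample-text.txt", use_unicode=False)
    decoded = raw_lines.map(lambda line: line.decode("utf-8"))   # decode only where needed
    print(decoded.take(1))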
Example No. 9
    def load_stream(self, stream):
        """
        Load a stream of un-ordered Arrow RecordBatches, where the last iteration yields
        a list of indices that can be used to put the RecordBatches in the correct order.
        """
        # load the batches
        for batch in self.serializer.load_stream(stream):
            yield batch

        # load the batch order indices or propagate any error that occurred in the JVM
        num = read_int(stream)
        if num == -1:
            error_msg = UTF8Deserializer().loads(stream)
            raise RuntimeError("An error occurred while calling "
                               "ArrowCollectSerializer.load_stream: {}".format(error_msg))
        batch_order = []
        for i in range(num):
            index = read_int(stream)
            batch_order.append(index)
        yield batch_order
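
A sketch of how a caller might consume this generator: collect the un-ordered batches, keep the final yielded list of indices, and use it to restore the original order. The serializer and stream names below are illustrative placeholders, not the actual PySpark call site.

    # Hypothetical consumption sketch: `results` holds the un-ordered batches followed by
    # the batch-order index list that load_stream() yields last.
    results = list(arrow_collect_serializer.load_stream(stream))
    batches, batch_order = results[:-1], results[-1]
    ordered_batches = [batches[i] for i in batch_order]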
Example No. 10
    def createStream(ssc, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        Create an input stream that pulls messages from an Event Hub.

        :param ssc:  StreamingContext object
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)

        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("com.ge.predix.predixinsights.eventhub.EventHubUtilsPythonHelper")
            helper = helperClass.newInstance()
            jstream = helper.createStream(ssc._jssc, jlevel)
        except Py4JJavaError as e:
            if 'ClassNotFoundException' in str(e.java_exception):
                EHUtils._printErrorMsg(ssc.sparkContext)
            raise e

        return DStream(jstream, ssc, UTF8Deserializer())
Example No. 11
 def createStream(ssc, brokerUrl, topic,
                  storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
     """
     Create an input stream that pulls messages from an MQTT broker.

     :param ssc:  StreamingContext object
     :param brokerUrl:  URL of the remote MQTT publisher
     :param topic:  topic name to subscribe to
     :param storageLevel:  RDD storage level.
     :return: A DStream object
     """
     jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
     try:
         helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
             .loadClass("org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper")
         helper = helperClass.newInstance()
         jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
     except Py4JJavaError as e:
         if 'ClassNotFoundException' in str(e.java_exception):
             MQTTUtils._printErrorMsg(ssc.sparkContext)
         raise e
     return DStream(jstream, ssc, UTF8Deserializer())
Example No. 12
def worker(sock, authenticated):
    """
    Called by a worker process after the fork().
    """
    signal.signal(SIGHUP, SIG_DFL)
    signal.signal(SIGCHLD, SIG_DFL)
    signal.signal(SIGTERM, SIG_DFL)
    # restore the handler for SIGINT,
    # it's useful for debugging (show the stacktrace before exit)
    signal.signal(SIGINT, signal.default_int_handler)

    # Read the socket using fdopen instead of socket.makefile() because the latter
    # seems to be very slow; note that we need to dup() the file descriptor because
    # otherwise writes also cause a seek that makes us miss data on the read side.
    buffer_size = int(os.environ.get("SPARK_BUFFER_SIZE", 65536))
    infile = os.fdopen(os.dup(sock.fileno()), "rb", buffer_size)
    outfile = os.fdopen(os.dup(sock.fileno()), "wb", buffer_size)

    if not authenticated:
        client_secret = UTF8Deserializer().loads(infile)
        if os.environ["PYTHON_WORKER_FACTORY_SECRET"] == client_secret:
            write_with_length("ok".encode("utf-8"), outfile)
            outfile.flush()
        else:
            write_with_length("err".encode("utf-8"), outfile)
            outfile.flush()
            sock.close()
            return 1

    exit_code = 0
    try:
        worker_main(infile, outfile)
    except SystemExit as exc:
        exit_code = compute_real_exit_code(exc.code)
    finally:
        try:
            outfile.flush()
        except Exception:
            pass
    return exit_code
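
For context, the connecting side of the authentication block above might look roughly like the following sketch: it sends the shared secret length-prefixed and then reads the length-prefixed "ok"/"err" reply. This is a simplified illustration, not the actual PySpark daemon/worker-factory code.

    import struct

    def client_handshake(sock, secret):
        # Hypothetical client counterpart to the auth block above: send the shared
        # secret length-prefixed, then read the length-prefixed "ok"/"err" reply.
        payload = secret.encode("utf-8")
        sock.sendall(struct.pack("!i", len(payload)) + payload)   # write_with_length equivalent
        infile = sock.makefile("rb")
        (reply_len,) = struct.unpack("!i", infile.read(4))
        return infile.read(reply_len).decode("utf-8") == "ok"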
Example No. 13
    def textFile(self, name, minPartitions=None, use_unicode=True):
        """
        Read a text file from HDFS, a local file system (available on all
        nodes), or any Hadoop-supported file system URI, and return it as an
        RDD of Strings.
        Read a text file from HDFS, the local file system (available on all nodes), or any
        Hadoop-supported file system URI, and return it as an RDD of strings.

        If use_unicode is False, the strings will be kept as `str` (encoding
        as `utf-8`), which is faster and smaller than unicode. (Added in
        Spark 1.2)
        If use_unicode is set to False, the strings stay as `str` (encoded as `utf-8`),
        which is faster and smaller than unicode. (Added in Spark 1.2.)

        >>> path = os.path.join(tempdir, "sample-text.txt")
        >>> with open(path, "w") as testFile:
        ...    _ = testFile.write("Hello world!")
        >>> textFile = sc.textFile(path)
        >>> textFile.collect()
        [u'Hello world!']
        """
        minPartitions = minPartitions or min(self.defaultParallelism, 2)  # get the minimum number of partitions
        return RDD(self._jsc.textFile(name, minPartitions), self,
                   UTF8Deserializer(use_unicode))
Example No. 14
    def createStreams(ssc,
                      logServiceProject,
                      logStoreName,
                      loghubConsumerGroupName,
                      loghubEndpoint,
                      numReceivers,
                      accessKeyId,
                      accessKeySecret,
                      storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        :param ssc: StreamingContext object.
        :param logServiceProject: The name of `LogService` project.
        :param logStoreName: The name of logStore.
        :param loghubConsumerGroupName: The group name of the loghub consumer. All consumer processes that share the
                                       same group name will consume the specified logStore together.
        :param loghubEndpoint: The endpoint of loghub.
        :param numReceivers: The number of receivers.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param storageLevel: RDD storage level.
        :return: A DStream object.
        """
        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
            helper = helperClass.newInstance()
            jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
            jstream = helper.createStream(ssc._jssc, logServiceProject,
                                          logStoreName,
                                          loghubConsumerGroupName,
                                          loghubEndpoint, numReceivers,
                                          accessKeyId, accessKeySecret, jlevel)

        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                LoghubUtils._printErrorMsg()
            raise e
        return DStream(jstream, ssc, UTF8Deserializer())
Example No. 15
def _load_from_socket(
    port: Optional[Union[str, int]],
    auth_secret: str,
    function: int,
    all_gather_message: Optional[str] = None,
) -> List[str]:
    """
    Load data from a given socket; this is a blocking method, so it only returns
    when the socket connection has been closed.
    """
    (sockfile, sock) = local_connect_and_auth(port, auth_secret)

    # The call may block forever, so no timeout
    sock.settimeout(None)

    if function == BARRIER_FUNCTION:
        # Make a barrier() function call.
        write_int(function, sockfile)
    elif function == ALL_GATHER_FUNCTION:
        # Make an all_gather() function call.
        write_int(function, sockfile)
        write_with_length(
            cast(str, all_gather_message).encode("utf-8"), sockfile)
    else:
        raise ValueError("Unrecognized function type")
    sockfile.flush()

    # Collect result.
    length = read_int(sockfile)
    res = []
    for i in range(length):
        res.append(UTF8Deserializer().loads(sockfile))

    # Release resources.
    sockfile.close()
    sock.close()

    return res
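
A hedged call sketch for the two function codes accepted above; a live barrier coordination server is assumed, so the port and secret values are placeholders rather than working credentials.

    # Hypothetical call sketch; port and auth_secret are placeholders.
    port, auth_secret = 50000, "secret-from-jvm"

    # barrier(): no payload, the reply is a list of strings (one entry per task)
    _load_from_socket(port, auth_secret, BARRIER_FUNCTION)

    # allGather(): each task contributes a message and receives all messages back
    messages = _load_from_socket(port, auth_secret, ALL_GATHER_FUNCTION,
                                 all_gather_message="worker-0 ready")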
Example No. 16
    def socketTextStream(
        self,
        hostname: str,
        port: int,
        storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_2
    ) -> "DStream[str]":
        """
        Create an input stream from a TCP source hostname:port. Data is received
        using a TCP socket, and the received bytes are interpreted as UTF-8 encoded,
        ``\\n``-delimited lines.

        Parameters
        ----------
        hostname : str
            Hostname to connect to for receiving data
        port : int
            Port to connect to for receiving data
        storageLevel : :class:`pyspark.StorageLevel`, optional
            Storage level to use for storing the received objects
        """
        jlevel = self._sc._getJavaStorageLevel(storageLevel)
        return DStream(self._jssc.socketTextStream(hostname, port, jlevel),
                       self, UTF8Deserializer())
Example No. 17
    def createPullingStreamAsRawBytes(ssc, queueName, accessKeyId, accessKeySecret,
                                      endpoint, storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        :param ssc: StreamingContext object.
        :param queueName: The name of MNS queue.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param endpoint: The endpoint of MNS service.
        :param storageLevel: RDD storage level.
        :return: A DStream object.
        """
        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.aliyun.mns.MnsUtilsHelper")
            helper = helperClass.newInstance()
            jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
            jstream = helper.createPullingStreamAsRawBytes(ssc._jssc, queueName, accessKeyId, accessKeySecret, endpoint, jlevel)

        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                MnsUtils._printErrorMsg()
            raise e
        return DStream(jstream, ssc, UTF8Deserializer())
Example No. 18
    def createStream(ssc,
                     brokerUrl,
                     topic,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_2):
        """
        Create an input stream that pulls messages from an MQTT broker.

        :param ssc:  StreamingContext object
        :param brokerUrl:  URL of the remote MQTT publisher
        :param topic:  topic name to subscribe to
        :param storageLevel:  RDD storage level.
        :return: A DStream object
        """
        try:
            helper = ssc._jvm.org.apache.spark.streaming.mqtt.MQTTUtilsPythonHelper(
            )
        except TypeError as e:
            if str(e) == "'JavaPackage' object is not callable":
                MQTTUtils._printErrorMsg(ssc.sparkContext)
            raise

        jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
        jstream = helper.createStream(ssc._jssc, brokerUrl, topic, jlevel)
        return DStream(jstream, ssc, UTF8Deserializer())
Example No. 19
def launch_gateway(conf=None):
    """
    Launch the JVM gateway.
    :param conf: spark configuration passed to spark-submit
    :return: a py4j JavaGateway connected to the gateway server
    """
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
        gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
    else:
        SPARK_HOME = _find_spark_home()
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        command = [os.path.join(SPARK_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ['--conf', '%s=%s' % (k, v)]
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        if os.environ.get("SPARK_TESTING"):
            submit_args = ' '.join([
                "--conf spark.ui.enabled=false",
                submit_args
            ])
        command = command + shlex.split(submit_args)

        # Create a temporary directory where the gateway server should write the connection
        # information.
        conn_info_dir = tempfile.mkdtemp()
        try:
            fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
            os.close(fd)
            os.unlink(conn_info_file)

            env = dict(os.environ)
            env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

            # Launch the Java gateway.
            # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
            if not on_windows:
                # Don't send ctrl-c / SIGINT to the Java gateway:
                def preexec_func():
                    signal.signal(signal.SIGINT, signal.SIG_IGN)
                proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
            else:
                # preexec_fn not supported on Windows
                proc = Popen(command, stdin=PIPE, env=env)

            # Wait for the file to appear, or for the process to exit, whichever happens first.
            while not proc.poll() and not os.path.isfile(conn_info_file):
                time.sleep(0.1)

            if not os.path.isfile(conn_info_file):
                raise Exception("Java gateway process exited before sending its port number")

            with open(conn_info_file, "rb") as info:
                gateway_port = read_int(info)
                gateway_secret = UTF8Deserializer().loads(info)
        finally:
            shutil.rmtree(conn_info_dir)

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen(["cmd", "/c", "taskkill", "/f", "/t", "/pid", str(proc.pid)])
            atexit.register(killChild)

    # Connect to the gateway
    gateway = JavaGateway(
        gateway_parameters=GatewayParameters(port=gateway_port, auth_token=gateway_secret,
                                             auto_convert=True))

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.ml.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    # TODO(davies): move into sql
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
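
A short usage sketch for launch_gateway(), assuming SPARK_HOME points at a Spark installation (or pyspark is installed) so spark-submit can be found; gateway.jvm then exposes the classes imported via java_import above.

    # Hypothetical usage sketch; assumes SPARK_HOME (or an installed pyspark) so that
    # spark-submit can be located.
    gateway = launch_gateway()
    jvm = gateway.jvm
    print(jvm.java.lang.System.getProperty("java.version"))   # talk to the launched JVM
    jconf = jvm.SparkConf(False)                              # class imported via java_import above
    jconf.set("spark.app.name", "gateway-demo")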
Example No. 20
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        version = utf8_deserializer.loads(infile)
        if version != "%d.%d" % sys.version_info[:2]:
            raise Exception(("Python in worker has different version %s than that in " +
                             "driver %s, PySpark cannot run with different minor versions." +
                             "Please check environment variables PYSPARK_PYTHON and " +
                             "PYSPARK_DRIVER_PYTHON are correctly set.") %
                            ("%d.%d" % sys.version_info[:2], version))

        # read inputs only for a barrier task
        isBarrier = read_bool(infile)
        boundPort = read_int(infile)
        secret = UTF8Deserializer().loads(infile)

        # set up memory limits
        memory_limit_mb = int(os.environ.get('PYSPARK_EXECUTOR_MEMORY_MB', "-1"))
        if memory_limit_mb > 0 and has_resource_module:
            total_memory = resource.RLIMIT_AS
            try:
                (soft_limit, hard_limit) = resource.getrlimit(total_memory)
                msg = "Current mem limits: {0} of max {1}\n".format(soft_limit, hard_limit)
                print(msg, file=sys.stderr)

                # convert to bytes
                new_limit = memory_limit_mb * 1024 * 1024

                if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit:
                    msg = "Setting mem limits to {0} of max {1}\n".format(new_limit, new_limit)
                    print(msg, file=sys.stderr)
                    resource.setrlimit(total_memory, (new_limit, new_limit))

            except (resource.error, OSError, ValueError) as e:
                # not all systems support resource limits, so warn instead of failing
                print("WARN: Failed to set memory limit: {0}\n".format(e), file=sys.stderr)

        # initialize global state
        taskContext = None
        if isBarrier:
            taskContext = BarrierTaskContext._getOrCreate()
            BarrierTaskContext._initialize(boundPort, secret)
        else:
            taskContext = TaskContext._getOrCreate()
        # read inputs for TaskContext info
        taskContext._stageId = read_int(infile)
        taskContext._partitionId = read_int(infile)
        taskContext._attemptNumber = read_int(infile)
        taskContext._taskAttemptId = read_long(infile)
        taskContext._localProperties = dict()
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            taskContext._localProperties[k] = v

        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))
        if sys.version > '3':
            import importlib
            importlib.invalidate_caches()

        # fetch names and values of broadcast variables
        needs_broadcast_decryption_server = read_bool(infile)
        num_broadcast_variables = read_int(infile)
        if needs_broadcast_decryption_server:
            # read the decrypted data from a server in the jvm
            port = read_int(infile)
            auth_secret = utf8_deserializer.loads(infile)
            (broadcast_sock_file, _) = local_connect_and_auth(port, auth_secret)

        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                if needs_broadcast_decryption_server:
                    read_bid = read_long(broadcast_sock_file)
                    assert(read_bid == bid)
                    _broadcastRegistry[bid] = \
                        Broadcast(sock_file=broadcast_sock_file)
                else:
                    path = utf8_deserializer.loads(infile)
                    _broadcastRegistry[bid] = Broadcast(path=path)

            else:
                bid = - bid - 1
                _broadcastRegistry.pop(bid)

        if needs_broadcast_decryption_server:
            broadcast_sock_file.write(b'1')
            broadcast_sock_file.close()

        _accumulatorRegistry.clear()
        eval_type = read_int(infile)
        if eval_type == PythonEvalType.NON_UDF:
            func, profiler, deserializer, serializer = read_command(pickleSer, infile)
        else:
            func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)

        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            serializer.dump_stream(func(split_index, iterator), outfile)

        if profiler:
            profiler.profile(process)
        else:
            process()
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
        except IOError:
            # JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("PySpark worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)
Example No. 21
from pyspark.java_gateway import local_connect_and_auth
from pyspark.taskcontext import BarrierTaskContext, TaskContext
from pyspark.files import SparkFiles
from pyspark.rdd import PythonEvalType
from pyspark.serializers import write_with_length, write_int, read_long, read_bool, \
    write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer, \
    BatchedSerializer, ArrowStreamPandasSerializer
from pyspark.sql.types import to_arrow_type, StructType
from pyspark.util import _get_argspec, fail_on_stopiteration
from pyspark import shuffle

if sys.version >= '3':
    basestring = str

pickleSer = PickleSerializer()
utf8_deserializer = UTF8Deserializer()


def report_times(outfile, boot, init, finish):
    write_int(SpecialLengths.TIMING_DATA, outfile)
    write_long(int(1000 * boot), outfile)
    write_long(int(1000 * init), outfile)
    write_long(int(1000 * finish), outfile)


def add_path(path):
    # the worker can be reused, so do not add the path multiple times
    if path not in sys.path:
        # overwrite system packages
        sys.path.insert(1, path)
Example No. 22
    def createStreams(ssc,
                      logServiceProject,
                      logStoreName,
                      loghubConsumerGroupName,
                      loghubEndpoint,
                      numReceivers,
                      accessKeyId,
                      accessKeySecret,
                      storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        '''

        :param ssc: StreamingContext object
        :param logServiceProject: The name of the LogService project
        :param logStoreName: The name of the logStore
        :param loghubConsumerGroupName: The group name of loghub consumer
        :param loghubEndpoint:
        :param numReceivers:
        :param accessKeyId:
        :param accessKeySecret:
        :param storageLevel: RDD storage level
        :return: A DStream Object
        '''
        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
            helper = helperClass.newInstance()
            jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
            jstream = helper.createStream(ssc._jssc, logServiceProject,
                                          logStoreName,
                                          loghubConsumerGroupName,
                                          loghubEndpoint, numReceivers,
                                          accessKeyId, accessKeySecret, jlevel)

        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                LoghubUtils._printErrorMsg()
            raise e
        return DStream(jstream, ssc, UTF8Deserializer())

    @staticmethod
    def createStream(ssc,
                     logServiceProject,
                     logStoreName,
                     loghubConsumerGroupName,
                     loghubEndpoint,
                     accessKeyId,
                     accessKeySecret,
                     storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        :param ssc: StreamingContext object.
        :param logServiceProject: The name of `LogService` project.
        :param logStoreName: The name of logStore.
        :param loghubConsumerGroupName: The group name of the loghub consumer. All consumer processes that share the
                                       same group name will consume the specified logStore together.
        :param loghubEndpoint: The endpoint of loghub.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param storageLevel: RDD storage level.
        :return: A DStream object.
        """
        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
            helper = helperClass.newInstance()
            jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
            jstream = helper.createStream(ssc._jssc, logServiceProject,
                                          logStoreName,
                                          loghubConsumerGroupName,
                                          loghubEndpoint, accessKeyId,
                                          accessKeySecret, jlevel)

        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                LoghubUtils._printErrorMsg()
            raise e
        return DStream(jstream, ssc, UTF8Deserializer())

    @staticmethod
    def _printErrorMsg():
        print("""
    ________________________________________________________________________________________________
      E-MapReduce SDK's libraries not found in class path. Try one of the following.
      1. Include the 'emr-logservice_2.10' library and its dependencies within the
         spark-submit command as
         $ bin/spark-submit --packages com.aliyun.emr:emr-logservice_2.10:%s ...
      2. Download the JAR of the artifact from Maven Central http://search.maven.org/,
         Group Id = com.aliyun.emr, Artifact Id = emr-logservice_2.10, Version = %s.
         Then, include the jar in the spark-submit command as
         $ bin/spark-submit --jars <emr-logservice_2.10-%s.jar> ...
    ________________________________________________________________________________________________
    """ % ('1.4.2-SNAPSHOT', '1.4.2-SNAPSHOT', '1.4.2-SNAPSHOT'))
Example No. 23
def launch_gateway(conf=None, popen_kwargs=None):
    """
    launch jvm gateway

    Parameters
    ----------
    conf : :py:class:`pyspark.SparkConf`
        spark configuration passed to spark-submit
    popen_kwargs : dict
        Dictionary of kwargs to pass to Popen when spawning
        the py4j JVM. This is a developer feature intended for use in
        customizing how pyspark interacts with the py4j JVM (e.g., capturing
        stdout/stderr).

    Returns
    -------
    ClientServer or JavaGateway
    """
    if "PYSPARK_GATEWAY_PORT" in os.environ:
        gateway_port = int(os.environ["PYSPARK_GATEWAY_PORT"])
        gateway_secret = os.environ["PYSPARK_GATEWAY_SECRET"]
        # Process already exists
        proc = None
    else:
        SPARK_HOME = _find_spark_home()
        # Launch the Py4j gateway using Spark's run command so that we pick up the
        # proper classpath and settings from spark-env.sh
        on_windows = platform.system() == "Windows"
        script = "./bin/spark-submit.cmd" if on_windows else "./bin/spark-submit"
        command = [os.path.join(SPARK_HOME, script)]
        if conf:
            for k, v in conf.getAll():
                command += ["--conf", "%s=%s" % (k, v)]
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        if os.environ.get("SPARK_TESTING"):
            submit_args = " ".join(
                ["--conf spark.ui.enabled=false", submit_args])
        command = command + shlex.split(submit_args)

        # Create a temporary directory where the gateway server should write the connection
        # information.
        conn_info_dir = tempfile.mkdtemp()
        try:
            fd, conn_info_file = tempfile.mkstemp(dir=conn_info_dir)
            os.close(fd)
            os.unlink(conn_info_file)

            env = dict(os.environ)
            env["_PYSPARK_DRIVER_CONN_INFO_PATH"] = conn_info_file

            # Launch the Java gateway.
            popen_kwargs = {} if popen_kwargs is None else popen_kwargs
            # We open a pipe to stdin so that the Java gateway can die when the pipe is broken
            popen_kwargs["stdin"] = PIPE
            # We always set the necessary environment variables.
            popen_kwargs["env"] = env
            if not on_windows:
                # Don't send ctrl-c / SIGINT to the Java gateway:
                def preexec_func():
                    signal.signal(signal.SIGINT, signal.SIG_IGN)

                popen_kwargs["preexec_fn"] = preexec_func
                proc = Popen(command, **popen_kwargs)
            else:
                # preexec_fn not supported on Windows
                proc = Popen(command, **popen_kwargs)

            # Wait for the file to appear, or for the process to exit, whichever happens first.
            while not proc.poll() and not os.path.isfile(conn_info_file):
                time.sleep(0.1)

            if not os.path.isfile(conn_info_file):
                raise RuntimeError(
                    "Java gateway process exited before sending its port number"
                )

            with open(conn_info_file, "rb") as info:
                gateway_port = read_int(info)
                gateway_secret = UTF8Deserializer().loads(info)
        finally:
            shutil.rmtree(conn_info_dir)

        # In Windows, ensure the Java child processes do not linger after Python has exited.
        # In UNIX-based systems, the child process can kill itself on broken pipe (i.e. when
        # the parent process' stdin sends an EOF). In Windows, however, this is not possible
        # because java.lang.Process reads directly from the parent process' stdin, contending
        # with any opportunity to read an EOF from the parent. Note that this is only best
        # effort and will not take effect if the python process is violently terminated.
        if on_windows:
            # In Windows, the child process here is "spark-submit.cmd", not the JVM itself
            # (because the UNIX "exec" command is not available). This means we cannot simply
            # call proc.kill(), which kills only the "spark-submit.cmd" process but not the
            # JVMs. Instead, we use "taskkill" with the tree-kill option "/t" to terminate all
            # child processes in the tree (http://technet.microsoft.com/en-us/library/bb491009.aspx)
            def killChild():
                Popen([
                    "cmd", "/c", "taskkill", "/f", "/t", "/pid",
                    str(proc.pid)
                ])

            atexit.register(killChild)

    # Connect to the gateway (or client server to pin the thread between JVM and Python)
    if os.environ.get("PYSPARK_PIN_THREAD", "true").lower() == "true":
        gateway = ClientServer(
            java_parameters=JavaParameters(port=gateway_port,
                                           auth_token=gateway_secret,
                                           auto_convert=True),
            python_parameters=PythonParameters(port=0, eager_load=False),
        )
    else:
        gateway = JavaGateway(gateway_parameters=GatewayParameters(
            port=gateway_port, auth_token=gateway_secret, auto_convert=True))

    # Store a reference to the Popen object for use by the caller (e.g., in reading stdout/stderr)
    gateway.proc = proc

    # Import the classes used by PySpark
    java_import(gateway.jvm, "org.apache.spark.SparkConf")
    java_import(gateway.jvm, "org.apache.spark.api.java.*")
    java_import(gateway.jvm, "org.apache.spark.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.ml.python.*")
    java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.resource.*")
    # TODO(davies): move into sql
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.api.python.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
    java_import(gateway.jvm, "scala.Tuple2")

    return gateway
Example No. 24
def main(infile, outfile):
    try:
        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        version = utf8_deserializer.loads(infile)
        if version != "%d.%d" % sys.version_info[:2]:
            raise Exception(
                ("Python in worker has different version %s than that in " +
                 "driver %s, PySpark cannot run with different minor versions."
                 + "Please check environment variables PYSPARK_PYTHON and " +
                 "PYSPARK_DRIVER_PYTHON are correctly set.") %
                ("%d.%d" % sys.version_info[:2], version))

        # read inputs only for a barrier task
        isBarrier = read_bool(infile)
        boundPort = read_int(infile)
        secret = UTF8Deserializer().loads(infile)
        # initialize global state
        taskContext = None
        if isBarrier:
            taskContext = BarrierTaskContext._getOrCreate()
            BarrierTaskContext._initialize(boundPort, secret)
        else:
            taskContext = TaskContext._getOrCreate()
        # read inputs for TaskContext info
        taskContext._stageId = read_int(infile)
        taskContext._partitionId = read_int(infile)
        taskContext._attemptNumber = read_int(infile)
        taskContext._taskAttemptId = read_long(infile)
        taskContext._localProperties = dict()
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            taskContext._localProperties[k] = v

        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(
            spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))
        if sys.version > '3':
            import importlib
            importlib.invalidate_caches()

        # fetch names and values of broadcast variables
        num_broadcast_variables = read_int(infile)
        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                path = utf8_deserializer.loads(infile)
                _broadcastRegistry[bid] = Broadcast(path=path)
            else:
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        _accumulatorRegistry.clear()
        eval_type = read_int(infile)
        if eval_type == PythonEvalType.NON_UDF:
            func, profiler, deserializer, serializer = read_command(
                pickleSer, infile)
        else:
            func, profiler, deserializer, serializer = read_udfs(
                pickleSer, infile, eval_type)

        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            serializer.dump_stream(func(split_index, iterator), outfile)

        if profiler:
            profiler.profile(process)
        else:
            process()
    except Exception:
        try:
            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(traceback.format_exc().encode("utf-8"), outfile)
        except IOError:
            # JVM closed the socket
            pass
        except Exception:
            # Write the error to stderr if it happened while serializing
            print("PySpark worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)
Example No. 25
def main(infile, outfile):
    faulthandler_log_path = os.environ.get("PYTHON_FAULTHANDLER_DIR", None)
    try:
        if faulthandler_log_path:
            faulthandler_log_path = os.path.join(faulthandler_log_path, str(os.getpid()))
            faulthandler_log_file = open(faulthandler_log_path, "w")
            faulthandler.enable(file=faulthandler_log_file)

        boot_time = time.time()
        split_index = read_int(infile)
        if split_index == -1:  # for unit tests
            sys.exit(-1)

        version = utf8_deserializer.loads(infile)
        if version != "%d.%d" % sys.version_info[:2]:
            raise RuntimeError(
                (
                    "Python in worker has different version %s than that in "
                    + "driver %s, PySpark cannot run with different minor versions. "
                    + "Please check environment variables PYSPARK_PYTHON and "
                    + "PYSPARK_DRIVER_PYTHON are correctly set."
                )
                % ("%d.%d" % sys.version_info[:2], version)
            )

        # read inputs only for a barrier task
        isBarrier = read_bool(infile)
        boundPort = read_int(infile)
        secret = UTF8Deserializer().loads(infile)

        # set up memory limits
        memory_limit_mb = int(os.environ.get("PYSPARK_EXECUTOR_MEMORY_MB", "-1"))
        if memory_limit_mb > 0 and has_resource_module:
            total_memory = resource.RLIMIT_AS
            try:
                (soft_limit, hard_limit) = resource.getrlimit(total_memory)
                msg = "Current mem limits: {0} of max {1}\n".format(soft_limit, hard_limit)
                print(msg, file=sys.stderr)

                # convert to bytes
                new_limit = memory_limit_mb * 1024 * 1024

                if soft_limit == resource.RLIM_INFINITY or new_limit < soft_limit:
                    msg = "Setting mem limits to {0} of max {1}\n".format(new_limit, new_limit)
                    print(msg, file=sys.stderr)
                    resource.setrlimit(total_memory, (new_limit, new_limit))

            except (resource.error, OSError, ValueError) as e:
                # not all systems support resource limits, so warn instead of failing
                lineno = (
                    getframeinfo(currentframe()).lineno + 1 if currentframe() is not None else 0
                )
                print(
                    warnings.formatwarning(
                        "Failed to set memory limit: {0}".format(e),
                        ResourceWarning,
                        __file__,
                        lineno,
                    ),
                    file=sys.stderr,
                )

        # initialize global state
        taskContext = None
        if isBarrier:
            taskContext = BarrierTaskContext._getOrCreate()
            BarrierTaskContext._initialize(boundPort, secret)
            # Set the task context instance here, so we can get it by TaskContext.get for
            # both TaskContext and BarrierTaskContext
            TaskContext._setTaskContext(taskContext)
        else:
            taskContext = TaskContext._getOrCreate()
        # read inputs for TaskContext info
        taskContext._stageId = read_int(infile)
        taskContext._partitionId = read_int(infile)
        taskContext._attemptNumber = read_int(infile)
        taskContext._taskAttemptId = read_long(infile)
        taskContext._cpus = read_int(infile)
        taskContext._resources = {}
        for r in range(read_int(infile)):
            key = utf8_deserializer.loads(infile)
            name = utf8_deserializer.loads(infile)
            addresses = []
            for a in range(read_int(infile)):
                addresses.append(utf8_deserializer.loads(infile))
            taskContext._resources[key] = ResourceInformation(name, addresses)

        taskContext._localProperties = dict()
        for i in range(read_int(infile)):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            taskContext._localProperties[k] = v

        shuffle.MemoryBytesSpilled = 0
        shuffle.DiskBytesSpilled = 0
        _accumulatorRegistry.clear()

        # fetch name of workdir
        spark_files_dir = utf8_deserializer.loads(infile)
        SparkFiles._root_directory = spark_files_dir
        SparkFiles._is_running_on_worker = True

        # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
        add_path(spark_files_dir)  # *.py files that were added will be copied here
        num_python_includes = read_int(infile)
        for _ in range(num_python_includes):
            filename = utf8_deserializer.loads(infile)
            add_path(os.path.join(spark_files_dir, filename))

        importlib.invalidate_caches()

        # fetch names and values of broadcast variables
        needs_broadcast_decryption_server = read_bool(infile)
        num_broadcast_variables = read_int(infile)
        if needs_broadcast_decryption_server:
            # read the decrypted data from a server in the jvm
            port = read_int(infile)
            auth_secret = utf8_deserializer.loads(infile)
            (broadcast_sock_file, _) = local_connect_and_auth(port, auth_secret)

        for _ in range(num_broadcast_variables):
            bid = read_long(infile)
            if bid >= 0:
                if needs_broadcast_decryption_server:
                    read_bid = read_long(broadcast_sock_file)
                    assert read_bid == bid
                    _broadcastRegistry[bid] = Broadcast(sock_file=broadcast_sock_file)
                else:
                    path = utf8_deserializer.loads(infile)
                    _broadcastRegistry[bid] = Broadcast(path=path)

            else:
                bid = -bid - 1
                _broadcastRegistry.pop(bid)

        if needs_broadcast_decryption_server:
            broadcast_sock_file.write(b"1")
            broadcast_sock_file.close()

        _accumulatorRegistry.clear()
        eval_type = read_int(infile)
        if eval_type == PythonEvalType.NON_UDF:
            func, profiler, deserializer, serializer = read_command(pickleSer, infile)
        else:
            func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)

        init_time = time.time()

        def process():
            iterator = deserializer.load_stream(infile)
            out_iter = func(split_index, iterator)
            try:
                serializer.dump_stream(out_iter, outfile)
            finally:
                if hasattr(out_iter, "close"):
                    out_iter.close()

        if profiler:
            profiler.profile(process)
        else:
            process()

        # Reset task context to None. This is a guard code to avoid residual context when worker
        # reuse.
        TaskContext._setTaskContext(None)
        BarrierTaskContext._setTaskContext(None)
    except BaseException as e:
        try:
            exc_info = None
            if os.environ.get("SPARK_SIMPLIFIED_TRACEBACK", False):
                tb = try_simplify_traceback(sys.exc_info()[-1])
                if tb is not None:
                    e.__cause__ = None
                    exc_info = "".join(traceback.format_exception(type(e), e, tb))
            if exc_info is None:
                exc_info = traceback.format_exc()

            write_int(SpecialLengths.PYTHON_EXCEPTION_THROWN, outfile)
            write_with_length(exc_info.encode("utf-8"), outfile)
        except IOError:
            # JVM closed the socket
            pass
        except BaseException:
            # Write the error to stderr if it happened while serializing
            print("PySpark worker failed with exception:", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
        sys.exit(-1)
    finally:
        if faulthandler_log_path:
            faulthandler.disable()
            faulthandler_log_file.close()
            os.remove(faulthandler_log_path)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    write_long(shuffle.MemoryBytesSpilled, outfile)
    write_long(shuffle.DiskBytesSpilled, outfile)

    # Mark the beginning of the accumulators section of the output
    write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
    write_int(len(_accumulatorRegistry), outfile)
    for (aid, accum) in _accumulatorRegistry.items():
        pickleSer._write_with_length((aid, accum._value), outfile)

    # check end of stream
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
        write_int(SpecialLengths.END_OF_STREAM, outfile)
    else:
        # write a different value to tell JVM to not reuse this worker
        write_int(SpecialLengths.END_OF_DATA_SECTION, outfile)
        sys.exit(-1)
Example No. 26
 def test_null_in_rdd(self):
     jrdd = self.sc._jvm.PythonUtils.generateRDDWithNull(self.sc._jsc)
     rdd = RDD(jrdd, self.sc, UTF8Deserializer())
     self.assertEqual([u"a", None, u"b"], rdd.collect())
     rdd = RDD(jrdd, self.sc, NoOpSerializer())
     self.assertEqual([b"a", None, b"b"], rdd.collect())
Example No. 27
    def createStream(ssc, logServiceProject, logStoreName, loghubConsumerGroupName,
                      numReceivers=None, loghubEndpoint=None, accessKeyId=None, accessKeySecret=None,
                      cursorPosition=None, mLoghubCursorStartTime=None, forceSpecial=None,
                      storageLevel=StorageLevel.MEMORY_AND_DISK_SER_2):
        """
        :param ssc: StreamingContext object.
        :param logServiceProject: The name of `LogService` project.
        :param logStoreName: The name of logStore.
        :param loghubConsumerGroupName: The group name of the loghub consumer. All consumer processes that share the
                                       same group name will consume the specified logStore together.
        :param loghubEndpoint: The endpoint of loghub.
        :param numReceivers: The number of receivers.
        :param accessKeyId: Aliyun Access Key ID.
        :param accessKeySecret: Aliyun Access Key Secret.
        :param cursorPosition: Set user defined cursor type.
        :param mLoghubCursorStartTime: Set user defined cursor position (Unix Timestamp).
        :param forceSpecial: Whether to force to set consume position as the `mLoghubCursorStartTime`.
        :param storageLevel: RDD storage level.
        :return: A DStream object.
        """
        try:
            helperClass = ssc._jvm.java.lang.Thread.currentThread().getContextClassLoader() \
                .loadClass("org.apache.spark.streaming.aliyun.logservice.LoghubUtilsHelper")
            helper = helperClass.newInstance()
            jlevel = ssc._sc._getJavaStorageLevel(storageLevel)
            if numReceivers is None:
                if loghubEndpoint and accessKeySecret and accessKeyId:
                    if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                            loghubConsumerGroupName, loghubEndpoint,
                                            accessKeyId, accessKeySecret, jlevel, cursorPosition,
                                            mLoghubCursorStartTime, forceSpecial)
                    elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                              loghubConsumerGroupName, loghubEndpoint,
                                              accessKeyId, accessKeySecret, jlevel)
                    else:
                        raise TypeError("Should provide cursorPosition, mLoghubCursorStartTime, forceSpecial or not at the same time")
                elif not loghubEndpoint and not accessKeySecret and not accessKeyId:
                    if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                      loghubConsumerGroupName, jlevel, cursorPosition,
                                                      mLoghubCursorStartTime, forceSpecial)
                    elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                      loghubConsumerGroupName, jlevel)
                    else:
                        raise TypeError("Should provide cursorPosition, mLoghubCursorStartTime, forceSpecial or not at the same time")
                else:
                    raise TypeError("Should provide loghubEndpoint, accessKeySecret and accessKeyId or not at the same time")
            else:
                if loghubEndpoint and accessKeySecret and accessKeyId:
                    if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                      loghubConsumerGroupName, loghubEndpoint, numReceivers,
                                                      accessKeyId, accessKeySecret, jlevel, cursorPosition,
                                                      mLoghubCursorStartTime, forceSpecial)
                    elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                      loghubConsumerGroupName, loghubEndpoint, numReceivers,
                                                      accessKeyId, accessKeySecret, jlevel)
                    else:
                        raise TypeError("Should provide cursorPosition, mLoghubCursorStartTime, forceSpecial or not at the same time")
                elif not loghubEndpoint and not accessKeySecret and not accessKeyId:
                    if cursorPosition and mLoghubCursorStartTime and forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                      loghubConsumerGroupName, numReceivers, jlevel,
                                                      cursorPosition, mLoghubCursorStartTime, forceSpecial)
                    elif not cursorPosition and not mLoghubCursorStartTime and not forceSpecial:
                        jstream = helper.createStream(ssc._jssc, logServiceProject, logStoreName,
                                                      loghubConsumerGroupName, numReceivers, jlevel)
                    else:
                        raise TypeError("Should provide cursorPosition, mLoghubCursorStartTime, forceSpecial or not at the same time")
                else:
                    raise TypeError("Should provide loghubEndpoint, accessKeySecret and accessKeyId or not at the same time")


        except Py4JJavaError as e:
            # TODO: use --jar once it also work on driver
            if 'ClassNotFoundException' in str(e.java_exception):
                LoghubUtils._printErrorMsg()
            raise e
        return DStream(jstream, ssc, UTF8Deserializer())