예제 #1
0
파일: context.py 프로젝트: fanzhen/spark
 def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer):
     """
     Using py4j to send a large dataset to the jvm is really slow, so we use either a file
     or a socket if we have encryption enabled.

     :param data: The dataset to send; it is handed unchanged to
            ``serializer.dump_stream``.
     :param serializer: An object whose ``dump_stream(data, stream)`` method writes
            ``data`` to a file-like ``stream``.
     :param reader_func:  A function which takes a filename and reads in the data in the jvm and
             returns a JavaRDD. Only used when encryption is disabled.
     :param createRDDServer:  A function which creates a PythonRDDServer in the jvm to
            accept the serialized data, for use when encryption is enabled.
     :return: ``server.getResult()`` when encryption is enabled, otherwise whatever
            ``reader_func`` returns for the temporary file.
     """
     if self._encryption_enabled:
         # with encryption, we open a server in java and send the data directly
         server = createRDDServer()
         (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
         chunked_out = ChunkedStream(sock_file, 8192)
         serializer.dump_stream(data, chunked_out)
         chunked_out.close()
         # this call will block until the server has read all the data and processed it (or
         # throws an exception)
         return server.getResult()

     # without encryption, we serialize to a file, and we read the file in java and
     # parallelize from there.
     tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
     try:
         try:
             serializer.dump_stream(data, tempFile)
         finally:
             # Always release the handle, even when serialization fails: otherwise the
             # descriptor leaks and the unlink below can fail on platforms that refuse
             # to delete a file that is still open.
             tempFile.close()
         return reader_func(tempFile.name)
     finally:
         # the file is read eagerly by reader_func, so we can delete it right after.
         os.unlink(tempFile.name)
예제 #2
0
 def _test_chunked_stream(self, data, py_buf_size):
     """
     Round-trip ``data`` through the chunked protocol: write it from python with a
     ``ChunkedStream`` using buffer size ``py_buf_size``, dechunk it on the jvm side,
     and check that the dechunked bytes match ``data`` element by element.
     """
     # Two scratch files: one receives the chunked output written from python, the other
     # is only a path for the jvm to write the dechunked result to (so it is closed at once).
     chunked_file = tempfile.NamedTemporaryFile(delete=False)
     dechunked_file = tempfile.NamedTemporaryFile(delete=False)
     dechunked_file.close()
     chunked_path = chunked_file.name
     dechunked_path = dechunked_file.name
     try:
         # write data using the chunked protocol from python.
         writer = ChunkedStream(chunked_file, py_buf_size)
         writer.write(data)
         writer.close()

         # now let java read the chunked file and copy the dechunked bytes out
         jvm = self._jvm
         jin = jvm.java.io.FileInputStream(chunked_path)
         jout = jvm.java.io.FileOutputStream(dechunked_path)
         jvm.DechunkedInputStream.dechunkAndCopyToOutput(jin, jout)

         # java should have decoded it back to the original data: same size first,
         # then the same value at every index.
         self.assertEqual(len(data), os.stat(dechunked_path).st_size)
         with open(dechunked_path, "rb") as f:
             decoded = bytearray(f.read())
         for idx, value in enumerate(decoded):
             self.assertEqual(data[idx], value, msg="idx = " + str(idx))
     finally:
         # both scratch files were created with delete=False, so remove them by hand
         os.unlink(chunked_path)
         os.unlink(dechunked_path)
예제 #3
0
 def __init__(self,
              sc=None,
              value=None,
              pickle_registry=None,
              path=None,
              sock_file=None):
     """
     Should not be called directly by users -- use L{SparkContext.broadcast()}
     instead.

     When ``sc`` is given (driver side) ``value`` is dumped either straight into a
     temporary file or, with encryption enabled, over a socket to the jvm. Without
     ``sc`` (executor side) the value is loaded from ``sock_file`` if one is given;
     otherwise only ``path`` is remembered.
     """
     if sc is None:
         # we're on an executor
         self._jbroadcast = None
         self._sc = None
         self._python_broadcast = None
         if sock_file is None:
             # the jvm just dumps the pickled data in path -- we'll unpickle lazily when
             # the value is requested
             assert path is not None
             self._path = path
         else:
             # the jvm is doing decryption for us.  Read the value
             # immediately from the sock_file
             self._value = self.load(sock_file)
         return

     # we're on the driver.  We want the pickled data to end up in a file (maybe encrypted)
     backing_file = NamedTemporaryFile(delete=False, dir=sc._temp_dir)
     self._path = backing_file.name
     self._sc = sc
     self._python_broadcast = sc._jvm.PythonRDD.setupBroadcast(self._path)
     if sc._encryption_enabled:
         # with encryption, we ask the jvm to do the encryption for us, we send it data
         # over a socket and then wait until the jvm reports it has received it
         port, auth_secret = self._python_broadcast.setupEncryptionServer()
         encryption_sock_file, _ = local_connect_and_auth(port, auth_secret)
         self.dump(value, ChunkedStream(encryption_sock_file, 8192))
         self._python_broadcast.waitTillDataReceived()
     else:
         # no encryption, we can just write pickled data directly to the file from python
         self.dump(value, backing_file)
     self._jbroadcast = sc._jsc.broadcast(self._python_broadcast)
     self._pickle_registry = pickle_registry