def _serialize_to_jvm(self, data, parallelism, serializer):
    """
    Serialize ``data`` and hand it to the JVM, returning what the JVM side gives back.

    Using py4j to send a large dataset to the jvm is really slow, so we use either a file
    or a socket if we have encryption enabled.

    :param data: the items to send; passed unchanged to ``serializer.dump_stream``.
    :param parallelism: forwarded unchanged to the JVM side (``PythonParallelizeServer``
        when encryption is enabled, ``PythonRDD.readRDDFromFile`` otherwise).
    :param serializer: object whose ``dump_stream(data, stream)`` writes ``data`` to a
        writable stream.
    :return: ``server.getResult()`` when encryption is enabled, otherwise the value
        returned by ``PythonRDD.readRDDFromFile``.
    """
    if self._encryption_enabled:
        # with encryption, we open a server in java and send the data directly
        server = self._jvm.PythonParallelizeServer(self._jsc.sc(), parallelism)
        (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
        chunked_out = ChunkedStream(sock_file, 8192)
        serializer.dump_stream(data, chunked_out)
        chunked_out.close()
        # this call will block until the server has read all the data and processed it (or
        # throws an exception)
        return server.getResult()
    else:
        # without encryption, we serialize to a file, and we read the file in java and
        # parallelize from there.
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        try:
            try:
                serializer.dump_stream(data, tempFile)
            finally:
                # always close, even when serialization fails: otherwise the handle leaks
                # and the unlink below is attempted on a file that is still open.
                tempFile.close()
            readRDDFromFile = self._jvm.PythonRDD.readRDDFromFile
            return readRDDFromFile(self._jsc, tempFile.name, parallelism)
        finally:
            # we eagerly read the file so we can delete right after.
            os.unlink(tempFile.name)
def _test_chunked_stream(self, data, py_buf_size):
    """
    Round-trip ``data`` through the chunked protocol: write it from Python with
    ``ChunkedStream`` and have the JVM copy the dechunked bytes into a second file,
    then check that file against ``data`` byte by byte.

    :param data: the payload to write; indexed per byte when comparing.
    :param py_buf_size: buffer size handed to ``ChunkedStream`` on the Python side.
    """
    encoded_tmp = tempfile.NamedTemporaryFile(delete=False)
    decoded_tmp = tempfile.NamedTemporaryFile(delete=False)
    # only the path of the second file is needed; the JVM writes to it by name.
    decoded_tmp.close()
    try:
        # python side: emit the payload using the chunked protocol.
        writer = ChunkedStream(encoded_tmp, py_buf_size)
        writer.write(data)
        writer.close()

        # java side: read the chunked file and copy it to the output file.
        java_in = self._jvm.java.io.FileInputStream(encoded_tmp.name)
        java_out = self._jvm.java.io.FileOutputStream(decoded_tmp.name)
        self._jvm.DechunkedInputStream.dechunkAndCopyToOutput(java_in, java_out)

        # the output must have the original length and the original contents.
        decoded_size = os.stat(decoded_tmp.name).st_size
        self.assertEqual(len(data), decoded_size)
        with open(decoded_tmp.name, "rb") as decoded:
            # read one byte at a time until read() returns the empty EOF marker.
            single_bytes = iter(lambda: decoded.read(1), b"")
            for idx, one in enumerate(single_bytes):
                self.assertEqual(data[idx], bytearray(one)[0], msg="idx = " + str(idx))
    finally:
        os.unlink(encoded_tmp.name)
        os.unlink(decoded_tmp.name)
def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer):
    """
    Using py4j to send a large dataset to the jvm is really slow, so we use either a file
    or a socket if we have encryption enabled.

    :param data: the items to send; passed unchanged to ``serializer.dump_stream``.
    :param serializer: object whose ``dump_stream(data, stream)`` writes ``data`` to a
        writable stream.
    :param reader_func: A function which takes a filename and reads in the data in the jvm
        and returns a JavaRDD. Only used when encryption is disabled.
    :param createRDDServer: A function which creates a PythonRDDServer in the jvm to
        accept the serialized data, for use when encryption is enabled.
    :return: ``server.getResult()`` when encryption is enabled, otherwise whatever
        ``reader_func`` returns.
    """
    if self._encryption_enabled:
        # with encryption, we open a server in java and send the data directly
        server = createRDDServer()
        (sock_file, _) = local_connect_and_auth(server.port(), server.secret())
        chunked_out = ChunkedStream(sock_file, 8192)
        serializer.dump_stream(data, chunked_out)
        chunked_out.close()
        # this call will block until the server has read all the data and processed it (or
        # throws an exception)
        return server.getResult()
    else:
        # without encryption, we serialize to a file, and we read the file in java and
        # parallelize from there.
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        try:
            try:
                serializer.dump_stream(data, tempFile)
            finally:
                # always close, even when serialization fails: otherwise the handle leaks
                # and the unlink below is attempted on a file that is still open.
                tempFile.close()
            return reader_func(tempFile.name)
        finally:
            # we eagerly read the file so we can delete right after.
            os.unlink(tempFile.name)
def _serialize_to_jvm(self, data, serializer, reader_func, createRDDServer):
    """
    Ship ``data`` to the jvm without pushing it through py4j, which is really slow for
    large datasets: a temporary file is used normally, a socket when encryption is on.

    :param data: the items to send; passed unchanged to ``serializer.dump_stream``.
    :param serializer: object whose ``dump_stream(data, stream)`` writes ``data`` to a
        writable stream.
    :param reader_func: A function which takes a filename and reads in the data in the jvm
        and returns a JavaRDD. Only used when encryption is disabled.
    :param createRDDServer: A function which creates a PythonRDDServer in the jvm to
        accept the serialized data, for use when encryption is enabled.
    :return: ``server.getResult()`` when encryption is enabled, otherwise whatever
        ``reader_func`` returns.
    """
    if not self._encryption_enabled:
        # No encryption: spill to a file under the temp dir and let the jvm read it back.
        spill = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        try:
            # leaving the with-block closes the file whether or not dump_stream raised.
            with spill:
                serializer.dump_stream(data, spill)
            return reader_func(spill.name)
        finally:
            # the file is read eagerly, so it can be removed straight away.
            os.unlink(spill.name)

    # Encryption: start a server on the java side and stream the data to it directly.
    rdd_server = createRDDServer()
    sock_file, _unused = local_connect_and_auth(rdd_server.port(), rdd_server.secret())
    stream = ChunkedStream(sock_file, 8192)
    serializer.dump_stream(data, stream)
    stream.close()
    # getResult() blocks until the server has read and processed all the data (or throws
    # an exception).
    return rdd_server.getResult()