def test_compressed_serializer(self):
    """Round-trip two batches of values through CompressedSerializer.

    Dumps a small mixed batch into an in-memory stream, reads it back,
    then appends a second, larger batch and verifies that re-reading
    from the start yields both batches in order.
    """
    from io import BytesIO

    ser = CompressedSerializer(PickleSerializer())
    buf = BytesIO()
    first_batch = ["abc", u"123", range(5)]

    # First batch: write, rewind, and read everything back.
    ser.dump_stream(first_batch, buf)
    buf.seek(0)
    self.assertEqual(first_batch, list(ser.load_stream(buf)))

    # load_stream consumed to EOF, so this appends after the first batch.
    ser.dump_stream(range(1000), buf)
    buf.seek(0)
    self.assertEqual(first_batch + list(range(1000)), list(ser.load_stream(buf)))
    buf.close()
def broadcast(self, value):
    """
    Broadcast a read-only variable to the cluster, returning a
    L{Broadcast<pyspark.broadcast.Broadcast>}
    object for reading it in distributed functions. The variable will be
    sent to each cluster only once.
    """
    ser = CompressedSerializer(PickleSerializer())
    # Shipping a large object through py4j directly is slow and memory
    # hungry, so spill it to a temp file and let the JVM read it from disk.
    dump_file = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    ser.dump_stream([value], dump_file)
    dump_file.close()
    jbroadcast = self._jvm.PythonRDD.readBroadcastFromFile(self._jsc, dump_file.name)
    return Broadcast(jbroadcast.id(), None, jbroadcast,
                     self._pickled_broadcast_vars, dump_file.name)
def test_compressed_serializer(self):
    """Round-trip two batches of values through CompressedSerializer.

    Works on both Python 2 (StringIO.StringIO) and Python 3
    (io.BytesIO): writes a small mixed batch, reads it back, appends a
    larger batch, and verifies a full re-read returns both in order.
    """
    ser = CompressedSerializer(PickleSerializer())
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import BytesIO as StringIO  # Python 3

    stream = StringIO()
    first_batch = ["abc", u"123", range(5)]

    # First batch: write, rewind, and read everything back.
    ser.dump_stream(first_batch, stream)
    stream.seek(0)
    self.assertEqual(first_batch, list(ser.load_stream(stream)))

    # load_stream consumed to EOF, so this appends after the first batch.
    ser.dump_stream(range(1000), stream)
    stream.seek(0)
    self.assertEqual(first_batch + list(range(1000)), list(ser.load_stream(stream)))
    stream.close()