Exemplo n.º 1
0
 def test_compressed_serializer(self):
     """Round-trip values through a CompressedSerializer wrapping a PickleSerializer.

     One batch is dumped into an in-memory byte stream and read back. A
     second batch is then dumped onto the same stream, and reading again
     from offset 0 is expected to yield both batches in order.
     """
     from io import BytesIO

     ser = CompressedSerializer(PickleSerializer())
     first_batch = ["abc", u"123", range(5)]
     # The context manager closes the buffer even when an assertion fails,
     # and the name `stream` avoids shadowing the stdlib `io` module.
     with BytesIO() as stream:
         ser.dump_stream(first_batch, stream)
         stream.seek(0)
         self.assertEqual(first_batch, list(ser.load_stream(stream)))
         # No seek before this dump: the final assertion relies on the
         # second batch landing after the first one in the stream.
         ser.dump_stream(range(1000), stream)
         stream.seek(0)
         self.assertEqual(first_batch + list(range(1000)),
                          list(ser.load_stream(stream)))
Exemplo n.º 2
0
 def broadcast(self, value):
     """
     Broadcast a read-only variable to the cluster, returning a
     L{Broadcast<pyspark.broadcast.Broadcast>}
     object for reading it in distributed functions. The variable will
     be sent to each cluster only once.

     The value is serialized with a CompressedSerializer wrapping a
     PickleSerializer into a temporary file created under
     C{self._temp_dir}; the path of that file is then handed to
     C{PythonRDD.readBroadcastFromFile} on the JVM side.

     @param value: the object to broadcast
     @return: a L{Broadcast<pyspark.broadcast.Broadcast>} built from the
         JVM broadcast handle and the temporary file path
     """
     ser = CompressedSerializer(PickleSerializer())
     # Passing a large object through py4j is very slow and needs a lot of
     # memory, so the data is handed over via a file instead.
     # delete=False keeps the file on disk after it is closed so it can be
     # read by name; the with-block guarantees the handle is closed even
     # if serialization raises.
     with NamedTemporaryFile(delete=False, dir=self._temp_dir) as tempFile:
         ser.dump_stream([value], tempFile)
     jbroadcast = self._jvm.PythonRDD.readBroadcastFromFile(self._jsc, tempFile.name)
     return Broadcast(jbroadcast.id(), None, jbroadcast, self._pickled_broadcast_vars, tempFile.name)
Exemplo n.º 3
0
 def test_compressed_serializer(self):
     """Round-trip values through a CompressedSerializer wrapping a PickleSerializer.

     Works with either the Python 2 ``StringIO.StringIO`` buffer or, when
     that module is missing, ``io.BytesIO``. One batch is dumped and read
     back; a second batch is then dumped onto the same buffer and reading
     again from offset 0 is expected to yield both batches in order.
     """
     ser = CompressedSerializer(PickleSerializer())
     try:
         from StringIO import StringIO
     except ImportError:
         from io import BytesIO as StringIO
     expected = ["abc", u"123", range(5)]
     # Named `buf` so the stdlib `io` module name is not shadowed.
     buf = StringIO()
     # try/finally rather than `with`: the Python 2 StringIO.StringIO class
     # is not a context manager, but the buffer should still be closed
     # when an assertion fails.
     try:
         ser.dump_stream(expected, buf)
         buf.seek(0)
         self.assertEqual(expected, list(ser.load_stream(buf)))
         # No seek before this dump: the final assertion relies on the
         # second batch landing after the first one in the buffer.
         ser.dump_stream(range(1000), buf)
         buf.seek(0)
         self.assertEqual(expected + list(range(1000)), list(ser.load_stream(buf)))
     finally:
         buf.close()
Exemplo n.º 4
0
 def broadcast(self, value):
     """
     Broadcast a read-only variable to the cluster, returning a
     L{Broadcast<pyspark.broadcast.Broadcast>}
     object for reading it in distributed functions. The variable will
     be sent to each cluster only once.

     The value is written, via a CompressedSerializer wrapping a
     PickleSerializer, to a temporary file inside C{self._temp_dir}.
     C{PythonRDD.readBroadcastFromFile} is then called on the JVM with
     that file's path.

     @param value: the object to broadcast
     @return: a L{Broadcast<pyspark.broadcast.Broadcast>} constructed from
         the JVM broadcast's id, the JVM broadcast object,
         C{self._pickled_broadcast_vars} and the temporary file path
     """
     ser = CompressedSerializer(PickleSerializer())
     # Passing a large object through py4j is very slow and needs a lot of
     # memory, so the serialized value goes through a file instead.
     # The with-block closes the handle even when dump_stream raises;
     # delete=False leaves the file in place so its name stays usable
     # after the close.
     with NamedTemporaryFile(delete=False, dir=self._temp_dir) as tempFile:
         ser.dump_stream([value], tempFile)
     jbroadcast = self._jvm.PythonRDD.readBroadcastFromFile(self._jsc, tempFile.name)
     return Broadcast(jbroadcast.id(), None, jbroadcast,
                      self._pickled_broadcast_vars, tempFile.name)