def _jdstream(self):
    """Return the JVM ``PythonDStream`` wrapper, creating and caching it on first use.

    The pipelined function is cloudpickled; oversized payloads are shipped via
    a broadcast variable instead of inline bytes.
    """
    if self._jdstream_val:
        return self._jdstream_val
    if self._bypass_serializer:
        self.jrdd_deserializer = NoOpSerializer()
    command = (self.func, self._prev_jrdd_deserializer,
               self._jrdd_deserializer)
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    # BUG FIX: compare the *length* of the pickled bytes against the 1 MiB
    # threshold; the original compared the bytes object itself to an int.
    if len(pickled_command) > (1 << 20):  # 1M
        broadcast = self.ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(broadcast)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    class_tag = self._prev_jdstream.classTag()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    python_dstream = self.ctx._jvm.PythonDStream(
        self._prev_jdstream.dstream(), bytearray(pickled_command), env,
        includes, self.preservesPartitioning, self.ctx.pythonExec,
        broadcast_vars, self.ctx._javaAccumulator, class_tag)
    self._jdstream_val = python_dstream.asJavaDStream()
    return self._jdstream_val
def jvertex_rdd(self):
    """Return the JVM ``PythonVertexRDD`` wrapper, creating and caching it on first use."""
    if self.jvrdd_val:
        return self.jvrdd_val
    if self.bypass_serializer:
        self.jvertex_rdd_deserializer = NoOpSerializer()
    # BUG FIX: these two definitions were commented out, but enable_profile
    # and profileStats are referenced below, raising NameError at runtime.
    # Restored to mirror the working jedge_rdd implementation.
    enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true"
    profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None
    command = (self.func, profileStats, self.prev_jvertex_rdd_deserializer,
               self.jvertex_rdd_deserializer)
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    if len(pickled_command) > (1 << 20):  # 1M
        self.broadcast = self.ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(self.broadcast)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    java_storage_level = self.ctx._getJavaStorageLevel(
        StorageLevel.MEMORY_ONLY)
    python_rdd = self.ctx._jvm.PythonVertexRDD(
        self.prev_jvertex_rdd, bytearray(pickled_command), env, includes,
        self.preservesPartitioning, self.ctx.pythonExec, broadcast_vars,
        self.ctx._javaAccumulator, java_storage_level)
    self.jvrdd_val = python_rdd.asJavaVertexRDD()
    if enable_profile:
        # Record profiling stats keyed by the resulting RDD id.
        self.id = self.jvrdd_val.id()
        self.ctx._add_profile(self.id, profileStats)
    return self.jvrdd_val
def _jdstream(self):
    """Return the JVM ``PythonDStream`` wrapper, creating and caching it on first use.

    The pipelined function is cloudpickled; oversized payloads are shipped via
    a broadcast variable instead of inline bytes.
    """
    if self._jdstream_val:
        return self._jdstream_val
    if self._bypass_serializer:
        self.jrdd_deserializer = NoOpSerializer()
    command = (self.func, self._prev_jrdd_deserializer,
               self._jrdd_deserializer)
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    # BUG FIX: compare the *length* of the pickled bytes against the 1 MiB
    # threshold; the original compared the bytes object itself to an int.
    if len(pickled_command) > (1 << 20):  # 1M
        broadcast = self.ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(broadcast)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    class_tag = self._prev_jdstream.classTag()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    python_dstream = self.ctx._jvm.PythonDStream(
        self._prev_jdstream.dstream(), bytearray(pickled_command), env,
        includes, self.preservesPartitioning, self.ctx.pythonExec,
        broadcast_vars, self.ctx._javaAccumulator, class_tag)
    self._jdstream_val = python_dstream.asJavaDStream()
    return self._jdstream_val
def test_itemgetter(self):
    """operator.itemgetter instances survive a CloudPickle round trip."""
    from operator import itemgetter
    serializer = CloudPickleSerializer()
    data = range(10)
    # Single-index and multi-index getters behave identically after a
    # serialize/deserialize cycle.
    for index_args in ((1,), (0, 3)):
        original = itemgetter(*index_args)
        restored = serializer.loads(serializer.dumps(original))
        self.assertEqual(original(data), restored(data))
def test_func_globals(self):
    """Pickling a function whose co_names mentions an unpicklable global works.

    Pickling the unpicklable object itself must fail, but a function that
    merely *names* it (``sys.exit`` shadowed by the global ``exit``) must
    still serialize.
    """
    class NotSerializable:
        def __reduce__(self):
            raise RuntimeError("not picklable")

    global exit
    exit = NotSerializable()
    serializer = CloudPickleSerializer()
    self.assertRaises(Exception, lambda: serializer.dumps(exit))

    def foo():
        sys.exit(0)

    self.assertTrue("exit" in foo.__code__.co_names)
    serializer.dumps(foo)
def test_function_module_name(self):
    """A function's __module__ attribute is preserved across serialization."""
    serializer = CloudPickleSerializer()

    def identity(x):
        return x

    restored = serializer.loads(serializer.dumps(identity))
    self.assertEqual(identity.__module__, restored.__module__)
def test_func_globals(self):
    """Pickling a function whose co_names mentions an unpicklable global works.

    Serializing the unpicklable object directly must raise, yet a function
    that only references the name (``sys.exit`` alongside the shadowed
    global ``exit``) must still pickle cleanly.
    """
    class NotSerializable(object):
        def __reduce__(self):
            raise Exception("not picklable")

    global exit
    exit = NotSerializable()
    serializer = CloudPickleSerializer()
    self.assertRaises(Exception, lambda: serializer.dumps(exit))

    def foo():
        sys.exit(0)

    self.assertTrue("exit" in foo.__code__.co_names)
    serializer.dumps(foo)
def test_pickling_file_handles(self):
    # to be corrected with SPARK-11160
    # Only exercised when xmlrunner is absent; with xmlrunner installed the
    # body is skipped entirely.
    try:
        import xmlrunner
    except ImportError:
        serializer = CloudPickleSerializer()
        handle = sys.stderr
        restored = serializer.loads(serializer.dumps(handle))
        self.assertEqual(handle, restored)
def test_pickling_file_handles(self):
    # to be corrected with SPARK-11160
    # Only exercised when xmlrunner is absent; with xmlrunner installed the
    # body is skipped entirely.
    try:
        import xmlrunner  # type: ignore[import]  # noqa: F401
    except ImportError:
        serializer = CloudPickleSerializer()
        handle = sys.stderr
        restored = serializer.loads(serializer.dumps(handle))
        self.assertEqual(handle, restored)
def tf_dataset(cls, func, total_size, bigdl_type="float"):
    """
    :param func: a function return a tensorflow dataset
    :param total_size: total size of this dataset
    :param bigdl_type: numeric type
    :return: A feature set
    """
    # FIX: call dumps on a serializer *instance* instead of passing the class
    # object itself as `self` to the unbound method; same bytes, correct idiom.
    pickled_func = CloudPickleSerializer().dumps(func)
    jvalue = callZooFunc(bigdl_type, "createFeatureSetFromTfDataset",
                         pickled_func, total_size)
    return cls(jvalue=jvalue)
def tensorflowGridSearch():
    """
    Wrap tensorflow so that it can be used in sklearn GridsearchCV
    :return:
    """
    # Load MNIST and coerce to the dtypes the estimator expects.
    dataX, dataY = getMnist()
    dataX = dataX.astype(np.float32)
    dataY = np.argmax(dataY, axis=1).astype(np.int32)

    tuned_parameters = [{'lr': [1e-1, 1e-2], 'iters': [10, 20]}]
    scores = ['precision', 'recall']

    estimator = DisLRModel(400, 10, 0.01, 10)
    search = GridSearchCV(estimator,
                          param_grid=tuned_parameters,
                          cv=2,
                          scoring='%s_macro' % "precision")
    search.fit(dataX, dataY)

    # test whether the model could be serialized
    cp = CloudPickleSerializer()
    cp.dumps(estimator)
def from_pytorch(model):
    """
    Create a TorchNet directly from PyTorch model, e.g. model in torchvision.models.
    :param model: a PyTorch model
    """
    # Flatten every trainable parameter into one contiguous numpy vector.
    weights = []
    for param in trainable_param(model):
        weights.append(param.view(-1))
    flatten_weight = torch.nn.utils.parameters_to_vector(
        weights).data.numpy()
    # FIX: call dumps on a serializer *instance* instead of passing the class
    # object itself as `self` to the unbound method; same bytes, correct idiom.
    bys = CloudPickleSerializer().dumps(model)
    net = TorchModel(bys, flatten_weight)
    return net
def test_attrgetter(self):
    """operator.attrgetter instances survive a CloudPickle round trip."""
    from operator import attrgetter
    serializer = CloudPickleSerializer()

    class Echo(object):
        # Any attribute access simply returns the attribute name.
        def __getattr__(self, item):
            return item

    target = Echo()

    def check_roundtrip(*names):
        original = attrgetter(*names)
        restored = serializer.loads(serializer.dumps(original))
        self.assertEqual(original(target), restored(target))

    # Simple and multi-attribute getters.
    check_roundtrip("a")
    check_roundtrip("a", "b")
    # Dotted-path getters traverse a nested object.
    target.e = Echo()
    check_roundtrip("e.a")
    check_roundtrip("e.a", "e.b")
def pytorch_dataloader(cls, dataloader, features="_data[0]", labels="_data[1]",
                       bigdl_type="float"):
    """
    Create FeatureSet from pytorch dataloader
    :param dataloader: a pytorch dataloader, or a function return pytorch dataloader.
    :param features: features in _data, _data is get from dataloader.
    :param labels: lables in _data, _data is get from dataloader.
    :param bigdl_type: numeric type
    :return: A feature set
    """
    import torch
    if isinstance(dataloader, torch.utils.data.DataLoader):
        # Warn when the batch size does not split evenly across nodes;
        # presumably the JVM side performs the actual adjustment.
        node_num, core_num = get_node_and_core_number()
        if dataloader.batch_size % node_num != 0:
            true_bs = math.ceil(dataloader.batch_size / node_num) * node_num
            warnings.warn(
                "Detect dataloader's batch_size is not divisible by node number("
                + str(node_num) + "), will adjust batch_size to "
                + str(true_bs) + " automatically")
        serialized = CloudPickleSerializer.dumps(CloudPickleSerializer, dataloader)
        return cls(jvalue=callZooFunc(bigdl_type, "createFeatureSetFromPyTorch",
                                      serialized, False, features, labels))
    if callable(dataloader):
        # A factory callable: the JVM side invokes it to build the loader.
        serialized = CloudPickleSerializer.dumps(CloudPickleSerializer, dataloader)
        return cls(jvalue=callZooFunc(bigdl_type, "createFeatureSetFromPyTorch",
                                      serialized, True, features, labels))
    raise ValueError(
        "Unsupported dataloader type, please pass pytorch dataloader"
        + " or a function to create pytorch dataloader.")
def getJavaVertexRDD(self, rdd, rdd_deserializer):
    """Wrap ``rdd`` as a JVM PythonVertexRDD and cache the Java handle.

    Ships an identity pipeline command (cloudpickled) to the JVM together
    with the Spark context's environment, includes and broadcast variables.

    :param rdd: Python RDD whose ``_jrdd`` will be wrapped.
    :param rdd_deserializer: serializer used for both ends of the identity
        command; replaced with NoOpSerializer when bypassing serialization.
    :return: the JavaVertexRDD handle (also stored on ``self.jvertex_rdd``).
    """
    if self.bypass_serializer:
        self.jvertex_rdd_deserializer = NoOpSerializer()
        rdd_deserializer = NoOpSerializer()
    # Profiling support is deliberately disabled here (see the matching
    # commented-out block at the end of this method).
    # enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true"
    # profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None

    # Identity mapPartitionsWithIndex function: pass elements through unchanged.
    def f(index, iterator):
        return iterator
    command = (f, rdd_deserializer, rdd_deserializer)
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    if len(pickled_command) > (1 << 20):  # 1M
        # Large commands are shipped via a broadcast variable instead of inline.
        self.broadcast = self.ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(self.broadcast)
        # the serialized command will be compressed by broadcast
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    target_storage_level = StorageLevel.MEMORY_ONLY
    java_storage_level = self.ctx._getJavaStorageLevel(target_storage_level)
    prdd = self.ctx._jvm.PythonVertexRDD(rdd._jrdd,
                                         bytearray(pickled_command),
                                         env, includes,
                                         self.preserve_partitioning,
                                         self.ctx.pythonExec,
                                         broadcast_vars,
                                         self.ctx._javaAccumulator,
                                         java_storage_level)
    self.jvertex_rdd = prdd.asJavaVertexRDD()
    # if enable_profile:
    #     self.id = self.jvertex_rdd.id()
    #     self.ctx._add_profile(self.id, profileStats)
    return self.jvertex_rdd
def jedge_rdd(self):
    """Return the JVM PythonEdgeRDD wrapper, creating and caching it on first use.

    Cloudpickles the pipelined function (plus optional profiling accumulator)
    and hands it to the JVM together with the context's environment, includes
    and broadcast variables.

    :return: the JavaEdgeRDD handle (cached on ``self.jerdd_val``).
    """
    if self.jerdd_val:
        return self.jerdd_val
    if self.bypass_serializer:
        self.jedge_rdd_deserializer = NoOpSerializer()
    # Optional profiling: only allocate the stats accumulator when enabled.
    enable_profile = self.ctx._conf.get("spark.python.profile", "false") == "true"
    profileStats = self.ctx.accumulator(None, PStatsParam) if enable_profile else None
    command = (self.func, profileStats, self.prev_jedge_rdd_deserializer,
               self.jedge_rdd_deserializer)
    # the serialized command will be compressed by broadcast
    ser = CloudPickleSerializer()
    pickled_command = ser.dumps(command)
    if len(pickled_command) > (1 << 20):  # 1M
        # Large commands are shipped via a broadcast variable instead of inline.
        self.broadcast = self.ctx.broadcast(pickled_command)
        pickled_command = ser.dumps(self.broadcast)
    broadcast_vars = ListConverter().convert(
        [x._jbroadcast for x in self.ctx._pickled_broadcast_vars],
        self.ctx._gateway._gateway_client)
    self.ctx._pickled_broadcast_vars.clear()
    env = MapConverter().convert(self.ctx.environment,
                                 self.ctx._gateway._gateway_client)
    includes = ListConverter().convert(self.ctx._python_includes,
                                       self.ctx._gateway._gateway_client)
    java_storage_level = self.ctx._getJavaStorageLevel(StorageLevel.MEMORY_ONLY)
    python_rdd = self.ctx._jvm.PythonEdgeRDD(self.prev_jedge_rdd,
                                             bytearray(pickled_command),
                                             env, includes,
                                             self.preservesPartitioning,
                                             self.ctx.pythonExec,
                                             broadcast_vars,
                                             self.ctx._javaAccumulator,
                                             java_storage_level)
    self.jerdd_val = python_rdd.asJavaEdgeRDD()
    if enable_profile:
        # Record profiling stats keyed by the resulting RDD id.
        self.id = self.jerdd_val.id()
        self.ctx._add_profile(self.id, profileStats)
    return self.jerdd_val
def pytorch_dataloader(cls, dataloader, bigdl_type="float"):
    """
    Create FeatureSet from pytorch dataloader

    :param dataloader: a pytorch dataloader
    :param bigdl_type: numeric type
    :return: A feature set
    """
    node_num, core_num = get_node_and_core_number()
    if dataloader.batch_size % node_num != 0:
        true_bs = math.ceil(dataloader.batch_size / node_num) * node_num
        # BUG FIX: node_num and true_bs are ints; concatenating them directly
        # to str raised TypeError. Convert explicitly before concatenating.
        warning_msg = "Detect dataloader's batch_size is not divisible by node number(" + \
                      str(node_num) + "), will adjust batch_size to " + \
                      str(true_bs) + " automatically"
        warnings.warn(warning_msg)
    bys = CloudPickleSerializer.dumps(CloudPickleSerializer, dataloader)
    jvalue = callZooFunc(bigdl_type, "createFeatureSetFromPyTorch", bys)
    return cls(jvalue=jvalue)
def jsonValue(self):
    """Build the JSON-serializable schema dict describing this UDT."""
    if self.scalaUDT():
        # A paired Scala UDT exists: reference it by class name.
        assert self.module() != '__main__', 'UDT in __main__ cannot work with ScalaUDT'
        return {
            "type": "udt",
            "class": self.scalaUDT(),
            "pyClass": "%s.%s" % (self.module(), type(self).__name__),
            "sqlType": self.sqlType().jsonValue()
        }
    # Pure-Python UDT: ship the pickled class itself, base64-encoded.
    pickled = CloudPickleSerializer().dumps(type(self))
    return {
        "type": "udt",
        "pyClass": "%s.%s" % (self.module(), type(self).__name__),
        "serializedClass": base64.b64encode(pickled).decode('utf8'),
        "sqlType": self.sqlType().jsonValue()
    }
def jsonValue(self):
    """Build the JSON-serializable schema dict describing this UDT."""
    if self.scalaUDT():
        # A paired Scala UDT exists: reference it by class name.
        assert self.module() != "__main__", "UDT in __main__ cannot work with ScalaUDT"
        return {
            "type": "udt",
            "class": self.scalaUDT(),
            "pyClass": "%s.%s" % (self.module(), type(self).__name__),
            "sqlType": self.sqlType().jsonValue(),
        }
    # Pure-Python UDT: ship the pickled class itself, base64-encoded.
    pickled = CloudPickleSerializer().dumps(type(self))
    return {
        "type": "udt",
        "pyClass": "%s.%s" % (self.module(), type(self).__name__),
        "serializedClass": base64.b64encode(pickled).decode("utf8"),
        "sqlType": self.sqlType().jsonValue(),
    }
def from_pytorch(criterion):
    """Create a TorchLoss from a PyTorch loss function/module.

    :param criterion: a PyTorch loss (criterion) to serialize and wrap.
    :return: a TorchLoss wrapping the cloudpickled criterion.
    """
    # FIX: call dumps on a serializer *instance* instead of passing the class
    # object itself as `self` to the unbound method; same bytes, correct idiom.
    bys = CloudPickleSerializer().dumps(criterion)
    net = TorchLoss(bys)
    return net
def _create_judf(self, name):
    """Build the JVM UserDefinedJythonFunction for this UDF.

    Accepts either a source string or a Python callable; for a callable,
    extracts its source via dill and its globals/closure via cloudpickle
    internals, serializing the required variables and pyspark imports
    separately for the Scala side.

    :param name: UDF name, or None to derive it from the function.
    :return: the Java UserDefinedJythonFunction handle.
    """
    func = self.func
    from pyspark.sql import SQLContext
    sc = SparkContext.getOrCreate()
    # Empty strings allow the Scala code to recognize no data and skip adding the Jython
    # code to handle vars or imports if not needed.
    serialized_vars = ""
    serialized_imports = ""
    if isinstance(func, string_types):
        # Caller supplied raw source directly; nothing to extract.
        src = func
    else:
        try:
            import dill
        except ImportError:
            raise ImportError(
                "Failed to import dill, magic Jython function serialization " +
                "depends on dill on the driver machine. You may wish to pass " +
                "your function in as a string instead.")
        try:
            src = dill.source.getsource(func)
        except:
            print(
                "Failed to get the source code associated with provided function. " +
                "You may wish to try and assign you lambda to a variable or pass in as a " +
                "string.")
            raise
        # Extract the globals, classes, etc. needed for this function
        file = StringIO()
        cp = cloudpickle.CloudPickler(file)
        code, f_globals, defaults, closure, dct, base_globals = cp.extract_func_data(
            func)
        closure_dct = {}
        if func.__closure__:
            # Map free variable names to their captured cell contents
            # (py2 vs py3 attribute names differ).
            if sys.version < "3":
                closure_dct = dict(
                    zip(func.func_code.co_freevars,
                        (c.cell_contents for c in func.func_closure)))
            else:
                closure_dct = dict(
                    zip(func.__code__.co_freevars,
                        (c.cell_contents for c in func.__closure__)))
        # Merge base globals, function globals and closure captures; later
        # updates win on key collisions.
        req = dict(base_globals)
        req.update(f_globals)
        req.update(closure_dct)
        # Serialize the "extras" and drop PySpark imports
        ser = CloudPickleSerializer()

        def isClass(v):
            return isinstance(v, (type, types.ClassType))

        def isInternal(v):
            return v.__module__.startswith("pyspark")

        # Sort out PySpark and non PySpark requirements
        req_vars = dict((k, v) for k, v in req.items()
                        if not isClass(v) or not isInternal(v))
        req_imports = dict(
            (k, v) for k, v in req.items() if isClass(v) and isInternal(v))
        if req_vars:
            serialized_vars = b64encode(
                ser.dumps(req_vars)).decode("utf-8")
        if req_imports:
            # (module, class name, bound name) triples for the Scala side.
            formatted_imports = list((v.__module__, v.__name__, k)
                                     for k, v in req_imports.items())
            serialized_imports = b64encode(
                ser.dumps(formatted_imports)).decode("utf-8")
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    jdt = spark._jsparkSession.parseDataType(self.returnType.json())
    if name is None:
        # Fall back to the function's own name, or its class name for
        # callable objects without __name__.
        f = self.func
        name = f.__name__ if hasattr(f, '__name__') else f.__class__.__name__
    # Create a Java representation
    wrapped_jython_func = _wrap_jython_func(sc, src, serialized_vars,
                                            serialized_imports, self.setupCode)
    judf = sc._jvm.org.apache.spark.sql.jython.UserDefinedJythonFunction(
        name, wrapped_jython_func, jdt)
    return judf
def test_function_module_name(self):
    """A lambda's __module__ attribute is preserved across serialization."""
    serializer = CloudPickleSerializer()
    identity = lambda x: x  # deliberately a lambda, not a def
    restored = serializer.loads(serializer.dumps(identity))
    self.assertEqual(identity.__module__, restored.__module__)
def pickle_command(command):
    """Serialize a command tuple with CloudPickle and return the bytes."""
    # the serialized command will be compressed by broadcast
    return CloudPickleSerializer().dumps(command)
def do_pickle(f, sc):
    """Cloudpickle a (func, profiler, in-serializer, out-serializer) command tuple.

    :param f: the function to embed in the command.
    :param sc: SparkContext whose serializer is used for both ends.
    """
    serializer = CloudPickleSerializer()
    serializer.dumps((f, None, sc.serializer, sc.serializer))