def _test_serialize(self, v):
    """Round-trip *v* through Python pickling and the JVM SerDe and
    check that it survives both directions unchanged, alone and in a
    batch of 100 copies.
    """
    ser = PickleSerializer()
    jvm_serde = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe
    # Pure-Python pickle round trip first.
    self.assertEqual(v, ser.loads(ser.dumps(v)))
    # Python -> JVM -> Python for a single value.
    jvec = jvm_serde.loads(bytearray(ser.dumps(v)))
    self.assertEqual(v, ser.loads(bytes(jvm_serde.dumps(jvec))))
    # Same round trip for a batch of identical values.
    batch = [v] * 100
    jvecs = jvm_serde.loads(bytearray(ser.dumps(batch)))
    self.assertEqual(batch, ser.loads(bytes(jvm_serde.dumps(jvecs))))
def _test_serialize(self, v):
    """Verify *v* survives local pickling and a JVM SerDe round trip,
    both as a single value and as a list of 100 copies.
    """
    ser = PickleSerializer()

    def jvm_roundtrip(value):
        # Ship the pickled value into the JVM and pickle it back out.
        java_obj = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(
            bytearray(ser.dumps(value)))
        return ser.loads(bytes(
            self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(java_obj)))

    self.assertEqual(v, ser.loads(ser.dumps(v)))
    self.assertEqual(v, jvm_roundtrip(v))
    many = [v] * 100
    self.assertEqual(many, jvm_roundtrip(many))
def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x: Data point (feature vector), or an RDD of data points
              (feature vectors).
    :return: An RDD of predictions for RDD input, or a single
             prediction for a single data point.
    """
    ser = PickleSerializer()
    if isinstance(x, RDD):
        # Bulk prediction: run the whole RDD through the Java model.
        first = x.take(1)
        if not first:
            # Empty RDD: nothing to predict.
            return self._sc.parallelize([])
        if not isinstance(first[0], Vector):
            x = x.map(_convert_to_vector)
        jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
        jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
        return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))
    else:
        # Assume x is a single data point: pickle it across to the JVM
        # and predict there.  (Local renamed from `bytes`, which
        # shadowed the builtin; also dropped the unused `SerDe` local.)
        pickled = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(pickled)
        return self._java_model.predict(vec)
def test_als_ratings_id_long_error(self):
    """User/product ids wider than a 32-bit int must be rejected by the
    JVM-side unpickler.
    """
    ser = PickleSerializer()
    rating = Rating(1205640308657491975, 50233468418, 1.0)
    # Both ids exceed the max int value, so JVM deserialization should raise.
    serde_loads = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads
    self.assertRaises(Py4JJavaError, serde_loads, bytearray(ser.dumps(rating)))
def test_als_ratings_serialize(self):
    """A Rating should survive a Python -> JVM -> Python round trip."""
    ser = PickleSerializer()
    serde = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe
    original = Rating(7, 1123, 3.14)
    # Pickle into the JVM, then pickle straight back out again.
    java_rating = serde.loads(bytearray(ser.dumps(original)))
    roundtripped = ser.loads(bytes(serde.dumps(java_rating)))
    self.assertEqual(original.user, roundtripped.user)
    self.assertEqual(original.product, roundtripped.product)
    self.assertAlmostEqual(original.rating, roundtripped.rating, 2)
def broadcast(self, value):
    """
    Broadcast a read-only variable to the cluster, returning a
    L{Broadcast<pyspark.broadcast.Broadcast>} object for reading it in
    distributed functions. The variable will be sent to each cluster
    only once.
    """
    # Pickle on the driver and hand the raw bytes to the JVM broadcast.
    serialized = bytearray(PickleSerializer().dumps(value))
    jbroadcast = self._jsc.broadcast(serialized)
    return Broadcast(jbroadcast.id(), value, jbroadcast,
                     self._pickled_broadcast_vars)
def broadcast(self, value):
    """
    Broadcast a read-only variable to the cluster, returning a
    C{Broadcast} object for reading it in distributed functions. The
    variable will be sent to each cluster only once.
    """
    pickled_value = PickleSerializer().dumps(value)
    java_broadcast = self._jsc.broadcast(bytearray(pickled_value))
    return Broadcast(
        java_broadcast.id(), value, java_broadcast,
        self._pickled_broadcast_vars)
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    """Run a JVM regression trainer and wrap its (weights, intercept)
    result in *modelClass*.

    ``initial_weights`` defaults to a zero vector sized to the feature
    dimension of the first data point.
    """
    if not initial_weights:
        initial_weights = [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # Reserialize with AutoBatchedSerializer before cache to reduce the
    # memory overhead in the JVM.
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(cached._to_java_object_rdd(), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    return modelClass(ser.loads(str(ans[0])), ans[1])
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    """Invoke *train_func* on the JVM and build a *modelClass* from the
    returned (weights, intercept) pair.
    """
    # Default to a zero vector matching the feature dimension.
    weights0 = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    pickled_weights = bytearray(ser.dumps(_convert_to_vector(weights0)))
    # Use AutoBatchedSerializer before cache to reduce the memory
    # overhead in the JVM.
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    result = train_func(_to_java_object_rdd(cached), pickled_weights)
    assert len(result) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(result[0]))
    return modelClass(weights, result[1])
def findSynonyms(self, x, num):
    """
    Find synonyms of a word.

    :param x: a word or a vector representation of a word
    :param num: number of synonyms to find
    :return: array of (word, cosineSimilarity)

    Note: local use only
    """
    # TODO: make findSynonyms usable in RDD operations from python side
    ser = PickleSerializer()
    # isinstance instead of `type(x) == str`: also accepts str subclasses.
    if isinstance(x, str):
        jlist = self._java_model.findSynonyms(x, num)
    else:
        # Renamed from `bytes`, which shadowed the builtin.
        pickled = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(pickled)
        jlist = self._java_model.findSynonyms(vec, num)
    words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
    return zip(words, similarity)
def findSynonyms(self, x, num):
    """
    Find synonyms of a word.

    :param x: a word or a vector representation of a word
    :param num: number of synonyms to find
    :return: array of (word, cosineSimilarity)

    Note: local use only
    """
    # TODO: make findSynonyms usable in RDD operations from python side
    ser = PickleSerializer()
    if isinstance(x, str):  # was `type(x) == str`; isinstance is idiomatic
        jlist = self._java_model.findSynonyms(x, num)
    else:
        # Pickle the vector across to the JVM; local renamed from
        # `bytes` to stop shadowing the builtin.
        payload = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(payload)
        jlist = self._java_model.findSynonyms(vec, num)
    words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
    return zip(words, similarity)
lrModel = lr.fit(spark_df_transformed)
lrModel.coefficientMatrix

#####################################
### Save and load model using PySpark
#####################################
lrModel.save('D:/lr-spark.dat')

from pyspark.ml.classification import LogisticRegressionModel
lr_loaded = LogisticRegressionModel.load('D:/lr-spark.dat')

###############
### Serializers
###############
from pyspark.serializers import PickleSerializer

# BUG FIX: `PickleSerializer.dumps(lrModel, 'D:/lr-spark-pickle.dat')`
# called an instance method on the class with a file path; `dumps` takes
# no path and only returns the pickled bytes, so nothing was ever
# written.  Instantiate the serializer and write the bytes explicitly.
# NOTE(review): pickling a JVM-backed model may still fail at runtime;
# the save()/load() API above is the supported persistence path.
with open('D:/lr-spark-pickle.dat', 'wb') as model_file:
    model_file.write(PickleSerializer().dumps(lrModel))

#######################################
### Column names in the spark DataFrame
#######################################
spark_df_transformed.columns
spark_df_transformed.printSchema()

#################################
### Schema of the spark DataFrame
#################################
spark_df_transformed.printSchema()
spark_df_transformed.stat

feat_vec = spark_df_transformed.select('features')
feat_vec.printSchema()
feat_vec.take(1)
def test_als_ratings_id_long_error(self):
    """Ids wider than 32 bits must fail when unpickled on the JVM."""
    ser = PickleSerializer()
    # rating user id exceeds max int value, should fail when pickled
    bad = Rating(1205640308657491975, 50233468418, 1.0)
    payload = bytearray(ser.dumps(bad))
    self.assertRaises(
        Py4JJavaError,
        self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads,
        payload)