Example #1
    def _test_serialize(self, v):
        ser = PickleSerializer()
        self.assertEqual(v, ser.loads(ser.dumps(v)))
        jvec = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(v)))
        nv = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvec)))
        self.assertEqual(v, nv)
        vs = [v] * 100
        jvecs = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(vs)))
        nvs = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvecs)))
        self.assertEqual(vs, nvs)
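The Python-only half of this round trip can be exercised without a JVM. A minimal sketch, assuming only the standard pyspark imports:

from pyspark.serializers import PickleSerializer
from pyspark.mllib.linalg import Vectors

ser = PickleSerializer()
v = Vectors.dense([1.0, 2.0, 3.0])
# dumps() pickles the vector to bytes; loads() restores an equal vector.
assert v == ser.loads(ser.dumps(v))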
Example #2
File: test_linalg.py Project: apache/spark
    def _test_serialize(self, v):
        ser = PickleSerializer()
        self.assertEqual(v, ser.loads(ser.dumps(v)))
        jvec = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(v)))
        nv = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvec)))
        self.assertEqual(v, nv)
        vs = [v] * 100
        jvecs = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(vs)))
        nvs = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jvecs)))
        self.assertEqual(vs, nvs)
Example #3
    def predict(self, x):
        """
        Predict the label of one or more examples.

        :param x:  Data point (feature vector),
                   or an RDD of data points (feature vectors).
        """
        SerDe = self._sc._jvm.SerDe
        ser = PickleSerializer()
        if isinstance(x, RDD):
            # Bulk prediction
            first = x.take(1)
            if not first:
                return self._sc.parallelize([])
            if not isinstance(first[0], Vector):
                x = x.map(_convert_to_vector)
            jPred = self._java_model.predict(x._to_java_object_rdd()).toJavaRDD()
            jpyrdd = self._sc._jvm.PythonRDD.javaToPython(jPred)
            return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))

        else:
            # Assume x is a single data point.
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            return self._java_model.predict(vec)
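A hedged usage sketch of the two branches; model and sc are assumptions standing in for a trained MLlib model and an active SparkContext:

# Single data point: returns a plain Python value.
label = model.predict([0.4, 1.9])

# RDD of data points: returns an RDD of predictions, computed lazily.
labels = model.predict(sc.parallelize([[0.4, 1.9], [2.0, 0.1]])).collect()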
Example #4
    def test_als_ratings_id_long_error(self):
        ser = PickleSerializer()
        r = Rating(1205640308657491975, 50233468418, 1.0)
        # rating user id exceeds max int value, should fail when pickled
        self.assertRaises(
            Py4JJavaError,
            self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads,
            bytearray(ser.dumps(r)))
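The failure comes from the JVM side, where Rating stores user and product as 32-bit ints. A small sketch of the boundary, reusing the same fixture:

MAX_JAVA_INT = 2**31 - 1
ok = Rating(MAX_JAVA_INT, 1, 1.0)       # still representable on the JVM side
bad = Rating(MAX_JAVA_INT + 1, 1, 1.0)  # fine in Python, but SerDe.loads rejects its pickled form, as the test asserts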
Example #5
    def test_als_ratings_serialize(self):
        ser = PickleSerializer()
        r = Rating(7, 1123, 3.14)
        jr = self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(bytearray(ser.dumps(r)))
        nr = ser.loads(bytes(self.sc._jvm.org.apache.spark.mllib.api.python.SerDe.dumps(jr)))
        self.assertEqual(r.user, nr.user)
        self.assertEqual(r.product, nr.product)
        self.assertAlmostEqual(r.rating, nr.rating, 2)
Example #6
File: context.py Project: iAmGhost/spark
    def broadcast(self, value):
        """
        Broadcast a read-only variable to the cluster, returning a
        L{Broadcast<pyspark.broadcast.Broadcast>}
        object for reading it in distributed functions. The variable will be
        sent to each cluster only once.
        """
        pickleSer = PickleSerializer()
        pickled = pickleSer.dumps(value)
        jbroadcast = self._jsc.broadcast(bytearray(pickled))
        return Broadcast(jbroadcast.id(), value, jbroadcast, self._pickled_broadcast_vars)
Example #7
    def broadcast(self, value):
        """
        Broadcast a read-only variable to the cluster, returning a C{Broadcast}
        object for reading it in distributed functions. The variable will be
        sent to each cluster only once.
        """
        pickleSer = PickleSerializer()
        pickled = pickleSer.dumps(value)
        jbroadcast = self._jsc.broadcast(bytearray(pickled))
        return Broadcast(jbroadcast.id(), value, jbroadcast,
                         self._pickled_broadcast_vars)
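A minimal usage sketch, assuming an active SparkContext named sc; the point of broadcast is that the pickled value ships to each executor once instead of once per task:

lookup = sc.broadcast({"a": 1, "b": 2})
rdd = sc.parallelize(["a", "b", "a"])
# Tasks read the shared value through .value instead of capturing the dict in their closures.
print(rdd.map(lambda key: lookup.value[key]).collect())  # [1, 2, 1]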
Example #8
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(cached._to_java_object_rdd(), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
Example #9
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(_to_java_object_rdd(cached), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
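Both variants hinge on the same byte-level handshake: Python pickles into a bytearray for the JVM, and the JVM hands bytes back for unpickling. A minimal local sketch of that round trip, with no JVM call involved:

ser = PickleSerializer()
initial_weights = [0.0, 0.0, 0.0]
payload = bytearray(ser.dumps(initial_weights))  # Python -> JVM direction
restored = ser.loads(bytes(payload))             # JVM -> Python direction
assert restored == initial_weights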
Example #10
    def findSynonyms(self, x, num):
        """
        :param x: a word or a vector representation of word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        Find synonyms of a word

        Note: local use only
        """
        # TODO: make findSynonyms usable in RDD operations from python side
        ser = PickleSerializer()
        if type(x) == str:
            jlist = self._java_model.findSynonyms(x, num)
        else:
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            jlist = self._java_model.findSynonyms(vec, num)
        words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
        return zip(words, similarity)
Example #11
File: feature.py Project: Ludwsam/spark
    def findSynonyms(self, x, num):
        """
        :param x: a word or a vector representation of word
        :param num: number of synonyms to find
        :return: array of (word, cosineSimilarity)

        Find synonyms of a word

        Note: local use only
        """
        # TODO: make findSynonyms usable in RDD operations from python side
        ser = PickleSerializer()
        if type(x) == str:
            jlist = self._java_model.findSynonyms(x, num)
        else:
            bytes = bytearray(ser.dumps(_convert_to_vector(x)))
            vec = self._sc._jvm.SerDe.loads(bytes)
            jlist = self._java_model.findSynonyms(vec, num)
        words, similarity = ser.loads(str(self._sc._jvm.SerDe.dumps(jlist)))
        return zip(words, similarity)
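A hedged usage sketch; model is an assumption standing in for a trained Word2Vec model:

# Each result pairs a candidate word with its cosine similarity to the query word.
for word, similarity in model.findSynonyms("spark", 5):
    print(word, similarity)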
Example #12
lrModel = lr.fit(spark_df_transformed)
lrModel.coefficientMatrix


#####################################
### Save and load model using PySpark
#####################################
lrModel.save('D:/lr-spark.dat')
from pyspark.ml.classification import LogisticRegressionModel
lr_loaded = LogisticRegressionModel.load('D:/lr-spark.dat')

###############
### Serializers
###############
from pyspark.serializers import PickleSerializer
# dumps() is an instance method that returns pickled bytes; it does not take a
# file path, so the file is written manually. Note that pyspark.ml models wrap
# JVM objects, so the save()/load() route above is the supported persistence
# path and plain pickling may fail.
ser = PickleSerializer()
with open('D:/lr-spark-pickle.dat', 'wb') as f:
    f.write(ser.dumps(lrModel))
#######################################
### Column names in the spark DataFrame
#######################################
spark_df_transformed.columns
spark_df_transformed.printSchema()

#################################
### Schema of the spark DataFrame
#################################
spark_df_transformed.printSchema()
spark_df_transformed.stat
feat_vec = spark_df_transformed.select('features')
feat_vec.printSchema()
feat_vec.take(1)