def test_serialize(self): sv = SparseVector(4, {1: 1, 3: 2}) dv = array([1., 2., 3., 4.]) lst = [1, 2, 3, 4] self.assertTrue(sv is _convert_vector(sv)) self.assertTrue(dv is _convert_vector(dv)) self.assertTrue(array_equal(dv, _convert_vector(lst))) self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(sv))) self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(dv)))) self.assertTrue(array_equal(dv, _deserialize_double_vector(_serialize_double_vector(lst))))
def test_serialize(self): from scipy.sparse import lil_matrix lil = lil_matrix((4, 1)) lil[1, 0] = 1 lil[3, 0] = 2 sv = SparseVector(4, {1: 1, 3: 2}) self.assertEquals(sv, _convert_vector(lil)) self.assertEquals(sv, _convert_vector(lil.tocsc())) self.assertEquals(sv, _convert_vector(lil.tocoo())) self.assertEquals(sv, _convert_vector(lil.tocsr())) self.assertEquals(sv, _convert_vector(lil.todok())) self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil))) self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsc()))) self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.tocsr()))) self.assertEquals(sv, _deserialize_double_vector(_serialize_double_vector(lil.todok())))
def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the Poisson distribution with the input mean. >>> import numpy as np >>> mean = 100.0 >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L) >>> mat = np.mat(rdd.collect()) >>> mat.shape (100, 100) >>> abs(mat.mean() - mean) < 0.5 True >>> from math import sqrt >>> abs(mat.std() - sqrt(mean)) < 0.5 True """ jrdd = sc._jvm.PythonMLLibAPI() \ .poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed) poisson = RDD(jrdd, sc, NoOpSerializer()) return poisson.map( lambda bytes: _deserialize_double_vector(bytearray(bytes)))
def train(cls, data, lambda_=1.0): """ Train a Naive Bayes model given an RDD of (label, features) vectors. This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can handle all kinds of discrete data. For example, by converting documents into TF-IDF vectors, it can be used for document classification. By making every vector a 0-1 vector, it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}). @param data: RDD of NumPy vectors, one per element, where the first coordinate is the label and the rest is the feature vector (e.g. a count vector). @param lambda_: The smoothing parameter """ sc = data.context dataBytes = _get_unmangled_double_vector_rdd(data) ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes(dataBytes._jrdd, lambda_) return NaiveBayesModel( _deserialize_double_vector(ans[0]), _deserialize_double_vector(ans[1]), _deserialize_double_matrix(ans[2]) )
def train(cls, data, lambda_=1.0): """ Train a Naive Bayes model given an RDD of (label, features) vectors. This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can handle all kinds of discrete data. For example, by converting documents into TF-IDF vectors, it can be used for document classification. By making every vector a 0-1 vector, it can also be used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}). @param data: RDD of NumPy vectors, one per element, where the first coordinate is the label and the rest is the feature vector (e.g. a count vector). @param lambda_: The smoothing parameter """ sc = data.context dataBytes = _get_unmangled_double_vector_rdd(data) ans = sc._jvm.PythonMLLibAPI().trainNaiveBayes(dataBytes._jrdd, lambda_) return NaiveBayesModel(_deserialize_double_vector(ans[0]), _deserialize_double_vector(ans[1]), _deserialize_double_matrix(ans[2]))
def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the standard normal distribution. >>> import numpy as np >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect()) >>> mat.shape (100, 100) >>> abs(mat.mean() - 0.0) < 0.1 True >>> abs(mat.std() - 1.0) < 0.1 True """ jrdd = sc._jvm.PythonMLLibAPI().normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed) normal = RDD(jrdd, sc, NoOpSerializer()) return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the uniform distribution U(0.0, 1.0). >>> import numpy as np >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect()) >>> mat.shape (10, 10) >>> mat.max() <= 1.0 and mat.min() >= 0.0 True >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions() 4 """ jrdd = sc._jvm.PythonMLLibAPI().uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed) uniform = RDD(jrdd, sc, NoOpSerializer()) return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
def uniformVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the uniform distribution U(0.0, 1.0). >>> import numpy as np >>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect()) >>> mat.shape (10, 10) >>> mat.max() <= 1.0 and mat.min() >= 0.0 True >>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions() 4 """ jrdd = sc._jvm.PythonMLLibAPI() \ .uniformVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed) uniform = RDD(jrdd, sc, NoOpSerializer()) return uniform.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
def normalVectorRDD(sc, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the standard normal distribution. >>> import numpy as np >>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1L).collect()) >>> mat.shape (100, 100) >>> abs(mat.mean() - 0.0) < 0.1 True >>> abs(mat.std() - 1.0) < 0.1 True """ jrdd = sc._jvm.PythonMLLibAPI() \ .normalVectorRDD(sc._jsc, numRows, numCols, numPartitions, seed) normal = RDD(jrdd, sc, NoOpSerializer()) return normal.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
def poissonVectorRDD(sc, mean, numRows, numCols, numPartitions=None, seed=None): """ Generates an RDD comprised of vectors containing i.i.d. samples drawn from the Poisson distribution with the input mean. >>> import numpy as np >>> mean = 100.0 >>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1L) >>> mat = np.mat(rdd.collect()) >>> mat.shape (100, 100) >>> abs(mat.mean() - mean) < 0.5 True >>> from math import sqrt >>> abs(mat.std() - sqrt(mean)) < 0.5 True """ jrdd = sc._jvm.PythonMLLibAPI().poissonVectorRDD(sc._jsc, mean, numRows, numCols, numPartitions, seed) poisson = RDD(jrdd, sc, NoOpSerializer()) return poisson.map(lambda bytes: _deserialize_double_vector(bytearray(bytes)))
def min(self): return _deserialize_double_vector(self._java_summary.min())
def numNonzeros(self): return _deserialize_double_vector(self._java_summary.numNonzeros())
def variance(self): return _deserialize_double_vector(self._java_summary.variance())
def loads(self, obj): return _deserialize_double_vector(bytearray(obj))