def corr(x, y=None, method=None):
    """
    Compute the correlation (matrix) for the input RDD(s) using the
    specified method.
    Methods currently supported: I{pearson (default), spearman}.

    If a single RDD of Vectors is passed in, a correlation matrix
    comparing the columns in the input RDD is returned. Use C{method=}
    to specify the method to be used for single RDD input.
    If two RDDs of floats are passed in, a single float is returned.

    >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
    >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
    >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
    >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
    True
    >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
    True
    >>> Statistics.corr(x, y, "spearman")
    0.5
    >>> from math import isnan
    >>> isnan(Statistics.corr(x, zeros))
    True
    >>> from pyspark.mllib.linalg import Vectors
    >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
    ...                       Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])])
    >>> pearsonCorr = Statistics.corr(rdd)
    >>> print str(pearsonCorr).replace('nan', 'NaN')
    [[ 1.          0.05564149         NaN  0.40047142]
     [ 0.05564149  1.                 NaN  0.91359586]
     [        NaN         NaN  1.                 NaN]
     [ 0.40047142  0.91359586         NaN  1.        ]]
    >>> spearmanCorr = Statistics.corr(rdd, method="spearman")
    >>> print str(spearmanCorr).replace('nan', 'NaN')
    [[ 1.          0.10540926         NaN  0.4       ]
     [ 0.10540926  1.                 NaN  0.9486833 ]
     [        NaN         NaN  1.                 NaN]
     [ 0.4         0.9486833          NaN  1.        ]]
    >>> try:
    ...     Statistics.corr(rdd, "spearman")
    ...     print "Method name as second argument without 'method=' shouldn't be allowed."
    ... except TypeError:
    ...     pass
    """
    sc = x.ctx
    # Check inputs to determine whether a single value or a matrix is needed for output.
    # Since it's legal for users to use the method name as the second argument, we need to
    # check if y is used to specify the method name instead.
    if type(y) == str:
        raise TypeError("Use 'method=' to specify method name.")
    jx = _to_java_object_rdd(x)
    if not y:
        resultMat = sc._jvm.PythonMLLibAPI().corr(jx, method)
        bytes = sc._jvm.SerDe.dumps(resultMat)
        ser = PickleSerializer()
        return ser.loads(str(bytes)).toArray()
    else:
        jy = _to_java_object_rdd(y)
        return sc._jvm.PythonMLLibAPI().corr(jx, jy, method)
def predict(self, x):
    """
    Predict the label of one or more examples.

    :param x: Data point (feature vector), or an RDD of data points
              (feature vectors).
    """
    SerDe = self._sc._jvm.SerDe
    ser = PickleSerializer()
    if isinstance(x, RDD):
        # Bulk prediction
        first = x.take(1)
        if not first:
            return self._sc.parallelize([])
        if not isinstance(first[0], Vector):
            x = x.map(_convert_to_vector)
        jPred = self._java_model.predict(_to_java_object_rdd(x)).toJavaRDD()
        jpyrdd = self._sc._jvm.SerDe.javaToPython(jPred)
        return RDD(jpyrdd, self._sc, BatchedSerializer(ser, 1024))
    else:
        # Assume x is a single data point.
        bytes = bytearray(ser.dumps(_convert_to_vector(x)))
        vec = self._sc._jvm.SerDe.loads(bytes)
        return self._java_model.predict(vec)
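# Usage sketch (not from the original source; `model` and the data values are
# illustrative): predict accepts either a single feature vector or an RDD of
# feature vectors, matching the two branches above.
#
#   point = [1.0, 0.0, 3.0]
#   model.predict(point)                        # single float label
#   model.predict(sc.parallelize([point]))      # RDD of labels, computed in bulk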
def colStats(rdd):
    """
    Computes column-wise summary statistics for the input RDD[Vector].

    >>> from pyspark.mllib.linalg import Vectors
    >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
    ...                       Vectors.dense([4, 5, 0, 3]),
    ...                       Vectors.dense([6, 7, 0, 8])])
    >>> cStats = Statistics.colStats(rdd)
    >>> cStats.mean()
    array([ 4.,  4.,  0.,  3.])
    >>> cStats.variance()
    array([  4.,  13.,   0.,  25.])
    >>> cStats.count()
    3L
    >>> cStats.numNonzeros()
    array([ 3.,  2.,  0.,  3.])
    >>> cStats.max()
    array([ 6.,  7.,  0.,  8.])
    >>> cStats.min()
    array([ 2.,  0.,  0., -2.])
    """
    sc = rdd.ctx
    jrdd = _to_java_object_rdd(rdd)
    cStats = sc._jvm.PythonMLLibAPI().colStats(jrdd)
    return MultivariateStatisticalSummary(sc, cStats)
def _py2java(sc, a):
    """ Convert a Python object into a Java object """
    if isinstance(a, RDD):
        a = _to_java_object_rdd(a)
    elif not isinstance(a, (int, long, float, bool, basestring)):
        bytes = bytearray(PickleSerializer().dumps(a))
        a = sc._jvm.SerDe.loads(bytes)
    return a
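# Illustrative sketch of the conversion rules above (values are made up):
#
#   _py2java(sc, 42)                       # primitives pass through unchanged
#   _py2java(sc, sc.parallelize([1, 2]))   # RDDs are wrapped as Java object RDDs
#   _py2java(sc, Vectors.dense([1.0]))     # anything else is pickled and
#                                          # deserialized by SerDe on the JVM side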
def _regression_train_wrapper(sc, train_func, modelClass, data, initial_weights):
    initial_weights = initial_weights or [0.0] * len(data.first().features)
    ser = PickleSerializer()
    initial_bytes = bytearray(ser.dumps(_convert_to_vector(initial_weights)))
    # use AutoBatchedSerializer before cache to reduce the memory
    # overhead in JVM
    cached = data._reserialize(AutoBatchedSerializer(ser)).cache()
    ans = train_func(_to_java_object_rdd(cached), initial_bytes)
    assert len(ans) == 2, "JVM call result had unexpected length"
    weights = ser.loads(str(ans[0]))
    return modelClass(weights, ans[1])
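# Sketch of the contract this wrapper expects (names below are hypothetical):
# `train_func` is called with the cached Java RDD and the pickled initial
# weights, and must return a two-element result of (serialized weight vector,
# intercept), which is unpacked into modelClass(weights, intercept).
#
#   def train_func(jrdd, initial_bytes):
#       return sc._jvm.PythonMLLibAPI().trainSomeModelWithSGD(jrdd, initial_bytes)
#
#   model = _regression_train_wrapper(sc, train_func, SomeLinearModel, data, None)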
def train(cls, rdd, k, maxIterations=100, runs=1, initializationMode="k-means||"):
    """Train a k-means clustering model."""
    sc = rdd.context
    ser = PickleSerializer()
    # cache serialized data to avoid object overhead in the JVM
    cached = rdd.map(_convert_to_vector)._reserialize(AutoBatchedSerializer(ser)).cache()
    model = sc._jvm.PythonMLLibAPI().trainKMeansModel(
        _to_java_object_rdd(cached), k, maxIterations, runs, initializationMode)
    bytes = sc._jvm.SerDe.dumps(model.clusterCenters())
    centers = ser.loads(str(bytes))
    return KMeansModel([c.toArray() for c in centers])
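# Usage sketch (illustrative data; `sc` is an active SparkContext):
#
#   points = sc.parallelize([[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]])
#   model = KMeans.train(points, 2, maxIterations=10, runs=1,
#                        initializationMode="random")
#   model.clusterCenters     # one center near each of the two groups of points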
def _prepare(cls, ratings):
    assert isinstance(ratings, RDD), "ratings should be RDD"
    first = ratings.first()
    if not isinstance(first, Rating):
        if isinstance(first, (tuple, list)):
            ratings = ratings.map(lambda x: Rating(*x))
        else:
            raise ValueError("rating should be RDD of Rating or tuple/list")
    # serialize with AutoBatchedSerializer before cache to reduce the
    # object overhead in the JVM
    cached = ratings._reserialize(AutoBatchedSerializer(PickleSerializer())).cache()
    return _to_java_object_rdd(cached)
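# Illustrative sketch (made-up ratings): plain (user, product, rating) tuples
# are accepted and converted to Rating before being cached for the JVM.
#
#   raw = sc.parallelize([(1, 1, 5.0), (1, 2, 1.0), (2, 1, 2.0)])
#   jrdd = ALS._prepare(raw)    # assumes this helper lives on the ALS class,
#                               # as the `cls` parameter suggests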
def _train(data, type, numClasses, categoricalFeaturesInfo, impurity="gini",
           maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0):
    first = data.first()
    assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
    sc = data.context
    jrdd = _to_java_object_rdd(data)
    cfiMap = MapConverter().convert(categoricalFeaturesInfo, sc._gateway._gateway_client)
    model = sc._jvm.PythonMLLibAPI().trainDecisionTreeModel(
        jrdd, type, numClasses, cfiMap, impurity, maxDepth, maxBins,
        minInstancesPerNode, minInfoGain)
    return DecisionTreeModel(sc, model)
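# Usage sketch (hypothetical data; the "classification" algorithm string and
# the DecisionTree._train access path are assumptions about how the public
# wrappers invoke this helper):
#
#   data = sc.parallelize([LabeledPoint(0.0, [0.0]), LabeledPoint(1.0, [1.0])])
#   model = DecisionTree._train(data, "classification", 2, {},
#                               impurity="gini", maxDepth=5)
#   model.predict([1.0])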
def predictAll(self, user_product):
    assert isinstance(user_product, RDD), "user_product should be RDD of (user, product)"
    first = user_product.first()
    if isinstance(first, list):
        user_product = user_product.map(tuple)
        first = tuple(first)
    assert type(first) is tuple and len(first) == 2, \
        "user_product should be RDD of (user, product)"
    if any(isinstance(x, str) for x in first):
        user_product = user_product.map(lambda (u, p): (int(u), int(p)))
        first = tuple(map(int, first))
    assert all(type(x) is int for x in first), "user and product in user_product should be int"
    sc = self._context
    tuplerdd = sc._jvm.SerDe.asTupleRDD(_to_java_object_rdd(user_product).rdd())
    jresult = self._java_model.predict(tuplerdd).toJavaRDD()
    return RDD(sc._jvm.SerDe.javaToPython(jresult), sc,
               AutoBatchedSerializer(PickleSerializer()))
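# Usage sketch (illustrative pairs; `model` is a trained matrix-factorization
# model such as one returned by ALS.train):
#
#   pairs = sc.parallelize([(1, 1), (1, 2), (2, 1)])
#   model.predictAll(pairs).collect()    # list of predicted Rating objects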
def train(cls, data, lambda_=1.0):
    """
    Train a Naive Bayes model given an RDD of (label, features) vectors.

    This is the Multinomial NB (U{http://tinyurl.com/lsdw6p}) which can
    handle all kinds of discrete data.  For example, by converting
    documents into TF-IDF vectors, it can be used for document
    classification.  By making every vector a 0-1 vector, it can also be
    used as Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

    :param data: RDD of LabeledPoint, where each point holds the label and
           the feature vector (e.g. a count vector).
    :param lambda_: The smoothing parameter
    """
    sc = data.context
    jlist = sc._jvm.PythonMLLibAPI().trainNaiveBayes(_to_java_object_rdd(data), lambda_)
    labels, pi, theta = PickleSerializer().loads(str(sc._jvm.SerDe.dumps(jlist)))
    return NaiveBayesModel(labels.toArray(), pi.toArray(), numpy.array(theta))
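# Usage sketch (made-up data): features are non-negative counts, as the
# multinomial formulation assumes.
#
#   data = sc.parallelize([LabeledPoint(0.0, [1.0, 0.0]),
#                          LabeledPoint(1.0, [0.0, 2.0])])
#   model = NaiveBayes.train(data, lambda_=1.0)
#   model.predict([0.0, 1.0])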
def fit(self, data):
    """
    Computes the vector representation of each word in the vocabulary.

    :param data: training data. RDD of subtype of Iterable[String]
    :return: python Word2VecModel instance
    """
    sc = data.context
    ser = PickleSerializer()
    vectorSize = self.vectorSize
    learningRate = self.learningRate
    numPartitions = self.numPartitions
    numIterations = self.numIterations
    seed = self.seed
    model = sc._jvm.PythonMLLibAPI().trainWord2Vec(
        _to_java_object_rdd(data), vectorSize, learningRate, numPartitions,
        numIterations, seed)
    return Word2VecModel(sc, model)
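# Usage sketch (tiny illustrative corpus): fit expects an RDD whose elements
# are sequences of strings, e.g. tokenized sentences.
#
#   sentences = sc.parallelize([["hello", "spark"], ["hello", "mllib"]])
#   model = Word2Vec().fit(sentences)
#   model.findSynonyms("hello", 1)    # nearest word and its similarity score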