def corr(x, y=None, method=None):
    """
    Compute the correlation (matrix) for the input RDD(s) using the
    specified method.
    Methods currently supported: I{pearson (default), spearman}.

    If a single RDD of Vectors is passed in, a correlation matrix
    comparing the columns in the input RDD is returned. Use C{method=}
    to specify the method to be used for single RDD input.
    If two RDDs of floats are passed in, a single float is returned.

    Raises C{TypeError} if C{y} is a string (the method name must be given
    with C{method=}), or if a single RDD is passed that cannot be converted
    to an RDD of Vectors.

    >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
    >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
    >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
    >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
    True
    >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
    True
    >>> Statistics.corr(x, y, "spearman")
    0.5
    >>> from math import isnan
    >>> isnan(Statistics.corr(x, zeros))
    True
    >>> from linalg import Vectors
    >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
    ...                       Vectors.dense([6, 7, 0, 8]), Vectors.dense([9, 0, 0, 1])])
    >>> Statistics.corr(rdd)
    array([[ 1.        ,  0.05564149,         nan,  0.40047142],
           [ 0.05564149,  1.        ,         nan,  0.91359586],
           [        nan,         nan,  1.        ,         nan],
           [ 0.40047142,  0.91359586,         nan,  1.        ]])
    >>> Statistics.corr(rdd, method="spearman")
    array([[ 1.        ,  0.10540926,         nan,  0.4       ],
           [ 0.10540926,  1.        ,         nan,  0.9486833 ],
           [        nan,         nan,  1.        ,         nan],
           [ 0.4       ,  0.9486833 ,         nan,  1.        ]])
    >>> try:
    ...     Statistics.corr(rdd, "spearman")
    ...     print "Method name as second argument without 'method=' shouldn't be allowed."
    ... except TypeError:
    ...     pass
    """
    sc = x.ctx
    # Check inputs to determine whether a single value or a matrix is needed
    # for output. Users may mistakenly pass the method name as the second
    # positional argument, so reject any string (including str subclasses)
    # given as y instead of treating it as a second RDD.
    if isinstance(y, str):
        raise TypeError("Use 'method=' to specify method name.")
    # Compare against None explicitly: "no second RDD" must not depend on the
    # truthiness of whatever object was passed as y.
    if y is None:
        try:
            Xser = _get_unmangled_double_vector_rdd(x)
        except TypeError:
            raise TypeError("corr called on a single RDD not consisted of Vectors.")
        resultMat = sc._jvm.PythonMLLibAPI().corr(Xser._jrdd, method)
        return _deserialize_double_matrix(resultMat)
    else:
        xSer = _get_unmangled_rdd(x, _serialize_double)
        ySer = _get_unmangled_rdd(y, _serialize_double)
        result = sc._jvm.PythonMLLibAPI().corr(xSer._jrdd, ySer._jrdd, method)
        return result
def train(cls, sc, data, k, maxIterations=100, runs=1, initialization_mode="k-means||"):
    """
    Train a k-means clustering model.

    @param sc: context whose C{_jvm} gateway is used to reach the JVM-side
               C{PythonMLLibAPI}.
    @param data: RDD converted with C{_get_unmangled_double_vector_rdd}
                 before being handed to the JVM.
    @param k: forwarded to C{trainKMeansModel} (presumably the number of
              clusters).
    @param maxIterations: forwarded to C{trainKMeansModel}.
    @param runs: forwarded to C{trainKMeansModel}.
    @param initialization_mode: forwarded to C{trainKMeansModel}.
    @return: a C{KMeansModel} built from the deserialized matrix returned
             by the JVM.
    @raise RuntimeError: if the JVM result does not consist of exactly one
                         C{bytearray}.
    """
    dataBytes = _get_unmangled_double_vector_rdd(data)
    ans = sc._jvm.PythonMLLibAPI().trainKMeansModel(
        dataBytes._jrdd, k, maxIterations, runs, initialization_mode)
    if len(ans) != 1:
        raise RuntimeError("JVM call result had unexpected length")
    elif not isinstance(ans[0], bytearray):
        # Use the type's name: concatenating the type object itself to a str
        # would raise TypeError and hide the intended RuntimeError.
        raise RuntimeError("JVM call result had first element of type "
                           + type(ans[0]).__name__ + " which is not bytearray")
    return KMeansModel(_deserialize_double_matrix(ans[0]))
def train(cls, sc, data, k, maxIterations=100, runs=1, initialization_mode="k-means||"):
    """
    Train a k-means clustering model.

    @param sc: context whose C{_jvm} gateway is used to reach the JVM-side
               C{PythonMLLibAPI}.
    @param data: RDD converted with C{_get_unmangled_double_vector_rdd}
                 before being handed to the JVM.
    @param k: forwarded to C{trainKMeansModel} (presumably the number of
              clusters).
    @param maxIterations: forwarded to C{trainKMeansModel}.
    @param runs: forwarded to C{trainKMeansModel}.
    @param initialization_mode: forwarded to C{trainKMeansModel}.
    @return: a C{KMeansModel} built from the deserialized matrix returned
             by the JVM.
    @raise RuntimeError: if the JVM result does not consist of exactly one
                         C{bytearray}.
    """
    dataBytes = _get_unmangled_double_vector_rdd(data)
    ans = sc._jvm.PythonMLLibAPI().trainKMeansModel(
        dataBytes._jrdd, k, maxIterations, runs, initialization_mode)
    if len(ans) != 1:
        raise RuntimeError("JVM call result had unexpected length")
    elif not isinstance(ans[0], bytearray):
        # Use the type's name: concatenating the type object itself to a str
        # would raise TypeError and hide the intended RuntimeError.
        raise RuntimeError("JVM call result had first element of type "
                           + type(ans[0]).__name__ + " which is not bytearray")
    return KMeansModel(_deserialize_double_matrix(ans[0]))
def train(cls, data, lambda_=1.0):
    """
    Fit a Naive Bayes model to an RDD of (label, features) vectors.

    The model is Multinomial NB (U{http://tinyurl.com/lsdw6p}), which can
    handle all kinds of discrete data. Documents converted into TF-IDF
    vectors can, for example, be classified with it. If every vector is a
    0-1 vector it can also serve as Bernoulli NB
    (U{http://tinyurl.com/p7c96j6}).

    @param data: RDD of NumPy vectors, one per element, where the first
                 coordinate is the label and the rest is the feature
                 vector (e.g. a count vector).
    @param lambda_: The smoothing parameter
    @return: a C{NaiveBayesModel} built from the first two entries of the
             JVM result (a deserialized vector and a deserialized matrix).
    """
    sc = data.context
    serialized = _get_unmangled_double_vector_rdd(data)
    api = sc._jvm.PythonMLLibAPI()
    jvm_result = api.trainNaiveBayes(serialized._jrdd, lambda_)
    # Index explicitly (rather than unpack) so extra trailing entries in the
    # JVM result are tolerated exactly as before.
    vector_part = _deserialize_double_vector(jvm_result[0])
    matrix_part = _deserialize_double_matrix(jvm_result[1])
    return NaiveBayesModel(vector_part, matrix_part)