예제 #1
0
파일: stat.py 프로젝트: BigCrunsh/spark
    def colStats(X):
        """
        Build a per-column statistical summary of an RDD[Vector].

        The input is converted with C{_get_unmangled_double_vector_rdd},
        handed to the JVM-side C{PythonMLLibAPI.colStats}, and the JVM
        result is wrapped in a L{MultivariateStatisticalSummary}.

        @param X: the input RDD of Vectors; its C{ctx} attribute supplies
               the context used to reach the JVM.
        @return: a L{MultivariateStatisticalSummary} wrapping the JVM result.

        >>> from linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
        ...                       Vectors.dense([4, 5, 0,  3]),
        ...                       Vectors.dense([6, 7, 0,  8])])
        >>> cStats = Statistics.colStats(rdd)
        >>> cStats.mean()
        array([ 4.,  4.,  0.,  3.])
        >>> cStats.variance()
        array([  4.,  13.,   0.,  25.])
        >>> cStats.count()
        3L
        >>> cStats.numNonzeros()
        array([ 3.,  2.,  0.,  3.])
        >>> cStats.max()
        array([ 6.,  7.,  0.,  8.])
        >>> cStats.min()
        array([ 2.,  0.,  0., -2.])
        """
        ctx = X.ctx
        serialized = _get_unmangled_double_vector_rdd(X)
        api = ctx._jvm.PythonMLLibAPI()
        java_summary = api.colStats(serialized._jrdd)
        return MultivariateStatisticalSummary(ctx, java_summary)
예제 #2
0
    def corr(x, y=None, method=None):
        """
        Compute the correlation (matrix) for the input RDD(s) using the
        specified method.
        Methods currently supported: I{pearson (default), spearman}.

        If a single RDD of Vectors is passed in, a correlation matrix
        comparing the columns in the input RDD is returned. Use C{method=}
        to specify the method to be used for single RDD input.
        If two RDDs of floats are passed in, a single float is returned.

        @param x: an RDD of Vectors (single-RDD form) or an RDD of floats
               (two-RDD form); its C{ctx} attribute supplies the context.
        @param y: optional second RDD of floats. Leaving it as C{None}
               selects the single-RDD (matrix) form. A string is rejected.
        @param method: name of the correlation method; it is forwarded
               unchanged to the JVM call.
        @return: the deserialized correlation matrix in the single-RDD form,
               otherwise the value returned by the JVM call.
        @raise TypeError: if C{y} is a string (the method name must be given
               as C{method=}), or if the single RDD cannot be converted by
               C{_get_unmangled_double_vector_rdd}.

        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
        True
        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
        True
        >>> Statistics.corr(x, y, "spearman")
        0.5
        >>> from math import isnan
        >>> isnan(Statistics.corr(x, zeros))
        True
        >>> from linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
        >>> Statistics.corr(rdd)
        array([[ 1.        ,  0.05564149,         nan,  0.40047142],
               [ 0.05564149,  1.        ,         nan,  0.91359586],
               [        nan,         nan,  1.        ,         nan],
               [ 0.40047142,  0.91359586,         nan,  1.        ]])
        >>> Statistics.corr(rdd, method="spearman")
        array([[ 1.        ,  0.10540926,         nan,  0.4       ],
               [ 0.10540926,  1.        ,         nan,  0.9486833 ],
               [        nan,         nan,  1.        ,         nan],
               [ 0.4       ,  0.9486833 ,         nan,  1.        ]])
        >>> try:
        ...     Statistics.corr(rdd, "spearman")
        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
        ... except TypeError:
        ...     pass
        """
        sc = x.ctx
        # A string in the second position means the caller passed the method
        # name positionally. That is not allowed: the second positional
        # argument is reserved for the second RDD. isinstance() is used so
        # that str subclasses are rejected as well.
        if isinstance(y, str):
            raise TypeError("Use 'method=' to specify method name.")
        # Decide between matrix output (one RDD) and scalar output (two RDDs)
        # by testing explicitly for the "argument omitted" default instead of
        # relying on the truthiness of whatever object was passed.
        if y is None:
            try:
                Xser = _get_unmangled_double_vector_rdd(x)
            except TypeError:
                raise TypeError("corr called on a single RDD not consisted of Vectors.")
            resultMat = sc._jvm.PythonMLLibAPI().corr(Xser._jrdd, method)
            return _deserialize_double_matrix(resultMat)
        xSer = _get_unmangled_rdd(x, _serialize_double)
        ySer = _get_unmangled_rdd(y, _serialize_double)
        return sc._jvm.PythonMLLibAPI().corr(xSer._jrdd, ySer._jrdd, method)
예제 #3
0
    def colStats(X):
        """
        Summarize every column of an RDD[Vector].

        The RDD is first converted by C{_get_unmangled_double_vector_rdd};
        the JVM-side C{PythonMLLibAPI.colStats} is then invoked on it and
        its result is returned wrapped in a
        L{MultivariateStatisticalSummary}.

        @param X: the RDD of Vectors to summarize; the context is read from
               its C{ctx} attribute.
        @return: a L{MultivariateStatisticalSummary} built from the context
               and the JVM result.

        >>> from linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([2, 0, 0, -2]),
        ...                       Vectors.dense([4, 5, 0,  3]),
        ...                       Vectors.dense([6, 7, 0,  8])])
        >>> cStats = Statistics.colStats(rdd)
        >>> cStats.mean()
        array([ 4.,  4.,  0.,  3.])
        >>> cStats.variance()
        array([  4.,  13.,   0.,  25.])
        >>> cStats.count()
        3L
        >>> cStats.numNonzeros()
        array([ 3.,  2.,  0.,  3.])
        >>> cStats.max()
        array([ 6.,  7.,  0.,  8.])
        >>> cStats.min()
        array([ 2.,  0.,  0., -2.])
        """
        context = X.ctx
        vector_rdd = _get_unmangled_double_vector_rdd(X)
        mllib_api = context._jvm.PythonMLLibAPI()
        jvm_stats = mllib_api.colStats(vector_rdd._jrdd)
        return MultivariateStatisticalSummary(context, jvm_stats)
예제 #4
0
파일: stat.py 프로젝트: stock20035/perrier
    def corr(x, y=None, method=None):
        """
        Compute the correlation (matrix) for the input RDD(s) using the
        specified method.
        Methods currently supported: I{pearson (default), spearman}.

        If a single RDD of Vectors is passed in, a correlation matrix
        comparing the columns in the input RDD is returned. Use C{method=}
        to specify the method to be used for single RDD input.
        If two RDDs of floats are passed in, a single float is returned.

        @param x: an RDD of Vectors (single-RDD form) or an RDD of floats
               (two-RDD form); its C{ctx} attribute supplies the context.
        @param y: optional second RDD of floats. Leaving it as C{None}
               selects the single-RDD (matrix) form. A string is rejected.
        @param method: name of the correlation method; it is forwarded
               unchanged to the JVM call.
        @return: the deserialized correlation matrix in the single-RDD form,
               otherwise the value returned by the JVM call.
        @raise TypeError: if C{y} is a string (the method name must be given
               as C{method=}), or if the single RDD cannot be converted by
               C{_get_unmangled_double_vector_rdd}.

        >>> x = sc.parallelize([1.0, 0.0, -2.0], 2)
        >>> y = sc.parallelize([4.0, 5.0, 3.0], 2)
        >>> zeros = sc.parallelize([0.0, 0.0, 0.0], 2)
        >>> abs(Statistics.corr(x, y) - 0.6546537) < 1e-7
        True
        >>> Statistics.corr(x, y) == Statistics.corr(x, y, "pearson")
        True
        >>> Statistics.corr(x, y, "spearman")
        0.5
        >>> from math import isnan
        >>> isnan(Statistics.corr(x, zeros))
        True
        >>> from linalg import Vectors
        >>> rdd = sc.parallelize([Vectors.dense([1, 0, 0, -2]), Vectors.dense([4, 5, 0, 3]),
        ...                       Vectors.dense([6, 7, 0,  8]), Vectors.dense([9, 0, 0, 1])])
        >>> Statistics.corr(rdd)
        array([[ 1.        ,  0.05564149,         nan,  0.40047142],
               [ 0.05564149,  1.        ,         nan,  0.91359586],
               [        nan,         nan,  1.        ,         nan],
               [ 0.40047142,  0.91359586,         nan,  1.        ]])
        >>> Statistics.corr(rdd, method="spearman")
        array([[ 1.        ,  0.10540926,         nan,  0.4       ],
               [ 0.10540926,  1.        ,         nan,  0.9486833 ],
               [        nan,         nan,  1.        ,         nan],
               [ 0.4       ,  0.9486833 ,         nan,  1.        ]])
        >>> try:
        ...     Statistics.corr(rdd, "spearman")
        ...     print "Method name as second argument without 'method=' shouldn't be allowed."
        ... except TypeError:
        ...     pass
        """
        sc = x.ctx
        # A string in the second position means the caller passed the method
        # name positionally. That is not allowed: the second positional
        # argument is reserved for the second RDD. isinstance() is used so
        # that str subclasses are rejected as well.
        if isinstance(y, str):
            raise TypeError("Use 'method=' to specify method name.")
        # Decide between matrix output (one RDD) and scalar output (two RDDs)
        # by testing explicitly for the "argument omitted" default instead of
        # relying on the truthiness of whatever object was passed.
        if y is None:
            try:
                Xser = _get_unmangled_double_vector_rdd(x)
            except TypeError:
                raise TypeError("corr called on a single RDD not consisted of Vectors.")
            resultMat = sc._jvm.PythonMLLibAPI().corr(Xser._jrdd, method)
            return _deserialize_double_matrix(resultMat)
        xSer = _get_unmangled_rdd(x, _serialize_double)
        ySer = _get_unmangled_rdd(y, _serialize_double)
        return sc._jvm.PythonMLLibAPI().corr(xSer._jrdd, ySer._jrdd, method)
예제 #5
0
 def train(cls, sc, data, k, maxIterations=100, runs=1,
         initialization_mode="k-means||"):
     """Train a k-means clustering model.

     The data is converted with C{_get_unmangled_double_vector_rdd} and
     handed to the JVM-side C{PythonMLLibAPI.trainKMeansModel}; the single
     bytearray it returns is deserialized into a matrix and wrapped in a
     L{KMeansModel}.

     @param sc: context whose C{_jvm} gateway is used for the JVM call.
     @param data: the training data RDD.
     @param k: forwarded unchanged to the JVM call.
     @param maxIterations: forwarded unchanged to the JVM call.
     @param runs: forwarded unchanged to the JVM call.
     @param initialization_mode: forwarded unchanged to the JVM call.
     @return: a L{KMeansModel} built from the deserialized JVM result.
     @raise RuntimeError: if the JVM result does not contain exactly one
            element, or that element is not a C{bytearray}.
     """
     dataBytes = _get_unmangled_double_vector_rdd(data)
     ans = sc._jvm.PythonMLLibAPI().trainKMeansModel(dataBytes._jrdd,
             k, maxIterations, runs, initialization_mode)
     if len(ans) != 1:
         raise RuntimeError("JVM call result had unexpected length")
     elif type(ans[0]) != bytearray:
         # Use the type's name: concatenating a str with the type object
         # itself would raise TypeError and hide this RuntimeError.
         raise RuntimeError("JVM call result had first element of type "
                 + type(ans[0]).__name__ + " which is not bytearray")
     return KMeansModel(_deserialize_double_matrix(ans[0]))
예제 #6
0
 def train(cls,
           sc,
           data,
           k,
           maxIterations=100,
           runs=1,
           initialization_mode="k-means||"):
     """Train a k-means clustering model.

     The data is converted with C{_get_unmangled_double_vector_rdd} and
     handed to the JVM-side C{PythonMLLibAPI.trainKMeansModel}; the single
     bytearray it returns is deserialized into a matrix and wrapped in a
     L{KMeansModel}.

     @param sc: context whose C{_jvm} gateway is used for the JVM call.
     @param data: the training data RDD.
     @param k: forwarded unchanged to the JVM call.
     @param maxIterations: forwarded unchanged to the JVM call.
     @param runs: forwarded unchanged to the JVM call.
     @param initialization_mode: forwarded unchanged to the JVM call.
     @return: a L{KMeansModel} built from the deserialized JVM result.
     @raise RuntimeError: if the JVM result does not contain exactly one
            element, or that element is not a C{bytearray}.
     """
     dataBytes = _get_unmangled_double_vector_rdd(data)
     ans = sc._jvm.PythonMLLibAPI().trainKMeansModel(
         dataBytes._jrdd, k, maxIterations, runs, initialization_mode)
     if len(ans) != 1:
         raise RuntimeError("JVM call result had unexpected length")
     elif type(ans[0]) != bytearray:
         # Use the type's name: concatenating a str with the type object
         # itself would raise TypeError and hide this RuntimeError.
         raise RuntimeError("JVM call result had first element of type " +
                            type(ans[0]).__name__ + " which is not bytearray")
     return KMeansModel(_deserialize_double_matrix(ans[0]))
예제 #7
0
    def train(cls, data, lambda_=1.0):
        """
        Fit a Naive Bayes model to an RDD of (label, features) vectors.

        The variant trained is Multinomial NB
        (U{http://tinyurl.com/lsdw6p}), which copes with any kind of
        discrete data; documents turned into TF-IDF vectors can be
        classified this way, for instance.  Feeding it 0-1 vectors makes it
        usable as Bernoulli NB (U{http://tinyurl.com/p7c96j6}) too.

        @param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        @param lambda_: The smoothing parameter
        @return: a L{NaiveBayesModel} built from the first element of the
               JVM result deserialized as a vector and the second
               deserialized as a matrix.
        """
        ctx = data.context
        serialized = _get_unmangled_double_vector_rdd(data)
        api = ctx._jvm.PythonMLLibAPI()
        jvm_result = api.trainNaiveBayes(serialized._jrdd, lambda_)
        vector_part = _deserialize_double_vector(jvm_result[0])
        matrix_part = _deserialize_double_matrix(jvm_result[1])
        return NaiveBayesModel(vector_part, matrix_part)
예제 #8
0
파일: tree.py 프로젝트: chewy6i/spark
 def predict(self, x):
     """
     Run the decision tree model on a single point or on an RDD of points.

     :param x:  Data point (feature vector),
                or an RDD of data points (feature vectors).
     :return:   For an RDD input, an RDD whose elements are the
                deserialized predictions (an empty RDD if the input is
                empty); otherwise the value returned by the JVM call for
                the single point.
     """
     api = self._sc._jvm.PythonMLLibAPI()
     if not isinstance(x, RDD):
         # Anything that is not an RDD is treated as one data point.
         point = _serialize_double_vector(x)
         return api.predictDecisionTreeModel(self._java_model, point)

     # Bulk prediction; an empty input short-circuits before the JVM call.
     if x.count() == 0:
         return self._sc.parallelize([])
     serialized_points = _get_unmangled_double_vector_rdd(x, cache=False)
     java_preds = api.predictDecisionTreeModel(self._java_model,
                                               serialized_points._jrdd)
     raw_preds = RDD(java_preds, self._sc, NoOpSerializer())
     return raw_preds.map(lambda raw: _deserialize_double(bytearray(raw)))
    def train(cls, data, lambda_=1.0):
        """
        Train a Naive Bayes model from an RDD of (label, features) vectors.

        This implements Multinomial NB (U{http://tinyurl.com/lsdw6p}) and
        therefore works with all kinds of discrete data.  Converting
        documents into TF-IDF vectors, for example, lets it do document
        classification, and restricting every vector to 0-1 entries turns
        it into Bernoulli NB (U{http://tinyurl.com/p7c96j6}).

        @param data: RDD of NumPy vectors, one per element, where the first
               coordinate is the label and the rest is the feature vector
               (e.g. a count vector).
        @param lambda_: The smoothing parameter
        @return: a L{NaiveBayesModel} constructed from the two elements of
               the JVM result, deserialized as a vector and as a matrix
               respectively.
        """
        context = data.context
        vector_rdd = _get_unmangled_double_vector_rdd(data)
        mllib_api = context._jvm.PythonMLLibAPI()
        trained = mllib_api.trainNaiveBayes(vector_rdd._jrdd, lambda_)
        as_vector = _deserialize_double_vector(trained[0])
        as_matrix = _deserialize_double_matrix(trained[1])
        return NaiveBayesModel(as_vector, as_matrix)
예제 #10
0
파일: tree.py 프로젝트: vardhan0707/spark
 def predict(self, x):
     """
     Predict labels with this decision tree model.

     :param x:  Data point (feature vector),
                or an RDD of data points (feature vectors).
     :return:   An RDD of deserialized predictions when ``x`` is an RDD
                (empty if ``x`` is empty); otherwise whatever the JVM call
                returns for the single serialized point.
     """
     mllib_api = self._sc._jvm.PythonMLLibAPI()
     is_bulk = isinstance(x, RDD)
     if not is_bulk:
         # Non-RDD input is assumed to be a single feature vector.
         serialized_x = _serialize_double_vector(x)
         return mllib_api.predictDecisionTreeModel(self._java_model,
                                                   serialized_x)

     # Nothing to predict for an empty RDD, so skip the JVM round trip.
     if x.count() == 0:
         return self._sc.parallelize([])
     vector_rdd = _get_unmangled_double_vector_rdd(x, cache=False)
     jvm_predictions = mllib_api.predictDecisionTreeModel(
         self._java_model, vector_rdd._jrdd)
     prediction_bytes = RDD(jvm_predictions, self._sc, NoOpSerializer())
     return prediction_bytes.map(
         lambda chunk: _deserialize_double(bytearray(chunk)))
예제 #11
0
 def train(cls, rdd, epsilon, numOfPoints):
     """
     Train a DBSCAN model through the JVM-side C{PythonDbscanAPI}.

     @param rdd: input RDD; it is converted with
            C{_get_unmangled_double_vector_rdd} and its C{context}
            supplies the JVM gateway.
     @param epsilon: forwarded unchanged to the JVM C{train} call
            (presumably the neighbourhood radius; confirm on the JVM side).
     @param numOfPoints: forwarded unchanged to the JVM C{train} call
            (presumably the minimum neighbour count; confirm on the JVM
            side).
     @return: a L{DbscanModel} wrapping the JVM result.
     """
     ctx = rdd.context
     serialized = _get_unmangled_double_vector_rdd(rdd)
     java_rdd = serialized._jrdd
     api = ctx._jvm.PythonDbscanAPI()
     java_model = api.train(java_rdd, epsilon, numOfPoints)
     return DbscanModel(java_model)
예제 #12
0
 def train(cls, rdd, epsilon, numOfPoints):
     """
     Fit a DBSCAN model by delegating to the JVM C{PythonDbscanAPI}.

     @param rdd: the data RDD; converted via
            C{_get_unmangled_double_vector_rdd}, with the JVM gateway
            taken from its C{context}.
     @param epsilon: passed straight through to the JVM C{train} call
            (likely the neighbourhood radius; verify against the JVM API).
     @param numOfPoints: passed straight through to the JVM C{train} call
            (likely the minimum neighbour count; verify against the JVM
            API).
     @return: the JVM result wrapped in a L{DbscanModel}.
     """
     context = rdd.context
     vector_rdd = _get_unmangled_double_vector_rdd(rdd)
     jvm_input = vector_rdd._jrdd
     dbscan_api = context._jvm.PythonDbscanAPI()
     jvm_model = dbscan_api.train(jvm_input, epsilon, numOfPoints)
     return DbscanModel(jvm_model)