Exemplo n.º 1
0
    def toPandas(self, df):
        """
        Convert a Spark DataFrame to a Pandas dataframe, with Vector columns as sparse rows.

        This is similar to the Spark DataFrame built-in toPandas() method, but it handles
        MLlib Vector columns differently.  Every value in a column whose schema type is
        VectorUDT is converted into a 1 x size scipy.sparse.csr_matrix, which is generally
        friendlier for PyData tools like scikit-learn.  All other columns are selected
        unchanged.

        .. note:: Experimental: This will likely be replaced in later releases with improved APIs.

        :param df: Spark DataFrame
        :return:  Pandas dataframe
        :raises TypeError: raised by the conversion UDF when a value in a VectorUDT column
                           is neither a DenseVector nor a SparseVector.  NOTE(review): the
                           UDF runs inside Spark, so the error presumably reaches the caller
                           wrapped in a Spark job failure -- confirm.
        """

        def toscipy(v):
            # Build a single-row CSR matrix: indptr == [0, nnz] delimits the one row.
            if isinstance(v, DenseVector):
                # A dense vector stores every entry, so the column indices are 0..size-1.
                return csr_matrix(
                    (v.values, np.arange(v.size), np.array([0, v.size])),
                    shape=(1, v.size))
            elif isinstance(v, SparseVector):
                # A sparse vector already carries its own column indices.
                return csr_matrix(
                    (v.values, v.indices, np.array([0, len(v.indices)])),
                    shape=(1, v.size))
            else:
                raise TypeError(
                    "Converter.toPandas found unknown Vector type: %s" %
                    type(v))

        tosparse = udf(toscipy, CSRVectorUDT())

        # Pair each column name with its schema field so Vector columns can be detected
        # by declared type; the select list is built fresh instead of mutated in place.
        selected = []
        for name, field in zip(df.columns, df.schema.fields):
            column = df[name]
            if isinstance(field.dataType, VectorUDT):
                # Keep the original column name after applying the UDF.
                column = tosparse(column).alias(name)
            selected.append(column)
        return df.select(*selected).toPandas()
Exemplo n.º 2
0
from scipy.sparse import csr_matrix

from spark_sklearn.converter import Converter
from spark_sklearn.grid_search import GridSearchCV
from spark_sklearn.udt import CSRVectorUDT

# Names exported by `from <this module> import *`.
__all__ = ['Converter', 'CSRVectorUDT', 'GridSearchCV']

# Import-time side effect: attach a CSRVectorUDT instance to scipy's csr_matrix class.
# NOTE(review): presumably this is the attribute PySpark's type inference looks up to
# map csr_matrix values to CSRVectorUDT -- confirm against pyspark.sql.types.
csr_matrix.__UDT__ = CSRVectorUDT()