def toPandas(self, df):
    """
    Convert a Spark DataFrame to a Pandas dataframe.

    This is similar to the Spark DataFrame built-in toPandas() method, but
    it handles MLlib Vector columns differently.  It converts MLlib Vectors
    into rows of scipy.sparse.csr_matrix, which is generally friendlier for
    PyData tools like scikit-learn.

    Columns whose schema type is not ``VectorUDT`` are selected unchanged.

    .. note:: Experimental: This will likely be replaced in later releases
              with improved APIs.

    :param df: Spark DataFrame
    :return: Pandas dataframe
    :raises TypeError: raised inside the conversion UDF when a value in a
        Vector column is neither a DenseVector nor a SparseVector.
        NOTE(review): how Spark surfaces this to the caller is not visible
        here -- confirm.
    """
    # Kept as a nested function so it travels with the UDF closure.
    def toscipy(v):
        """Turn one MLlib vector into a 1 x v.size csr_matrix."""
        if isinstance(v, DenseVector):
            # Every position is stored explicitly: column indices
            # 0..size-1, and a single row spanning all of them.
            return csr_matrix(
                (v.values, np.arange(v.size), np.array([0, v.size])),
                shape=(1, v.size))
        if isinstance(v, SparseVector):
            # Reuse the vector's own index/value arrays; the single row
            # spans len(v.indices) stored entries.
            return csr_matrix(
                (v.values, v.indices, np.array([0, len(v.indices)])),
                shape=(1, v.size))
        raise TypeError(
            "Converter.toPandas found unknown Vector type: %s" % type(v))

    tosparse = udf(toscipy, CSRVectorUDT())

    # Convert any MLlib Vector columns to scipy.sparse.csr_matrix, keeping
    # each column's original name; pass every other column through as-is.
    selected = [
        tosparse(df[name]).alias(name)
        if isinstance(field.dataType, VectorUDT)
        else df[name]
        for name, field in zip(df.columns, df.schema.fields)
    ]
    return df.select(*selected).toPandas()
# Package-level exports, plus a one-time patch of scipy's csr_matrix class.
from scipy.sparse import csr_matrix

from spark_sklearn.converter import Converter
from spark_sklearn.grid_search import GridSearchCV
from spark_sklearn.udt import CSRVectorUDT

# Names exported by ``from <this module> import *``.
__all__ = ['Converter', 'CSRVectorUDT', 'GridSearchCV']

# Attach a CSRVectorUDT instance to the csr_matrix class as ``__UDT__``.
# This runs at import time and mutates the scipy class for the whole process.
# NOTE(review): presumably this lets Spark map csr_matrix values to
# CSRVectorUDT when inferring schemas -- confirm against the PySpark UDT docs.
csr_matrix.__UDT__ = CSRVectorUDT()