def test_indexed_row_matrix_from_dataframe(self):
    from pyspark.sql.utils import IllegalArgumentException

    df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
    matrix = IndexedRowMatrix(df)
    self.assertEqual(matrix.numRows(), 1)
    self.assertEqual(matrix.numCols(), 1)
    with self.assertRaises(IllegalArgumentException):
        IndexedRowMatrix(df.drop("_1"))
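For context, a minimal standalone sketch of the two construction paths the test above exercises: building an IndexedRowMatrix from an RDD of IndexedRow objects and from a two-column DataFrame (integer index, vector). The SparkSession setup and the tiny two-row dataset are assumptions added for illustration, not part of the test.

from pyspark.sql import SparkSession, Row
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

spark = SparkSession.builder.appName("indexed-row-matrix-demo").getOrCreate()
sc = spark.sparkContext

# 1) from an RDD of IndexedRow(index, vector)
rows = sc.parallelize([
    IndexedRow(0, Vectors.dense([1.0, 2.0])),
    IndexedRow(1, Vectors.dense([3.0, 4.0])),
])
mat_from_rdd = IndexedRowMatrix(rows)
print(mat_from_rdd.numRows(), mat_from_rdd.numCols())   # 2 2

# 2) from a DataFrame with an integer index column and a vector column,
#    which is what the test above checks (columns are inferred as _1, _2)
df = spark.createDataFrame([Row(0, Vectors.dense([1.0, 2.0])),
                            Row(1, Vectors.dense([3.0, 4.0]))])
mat_from_df = IndexedRowMatrix(df)
print(mat_from_df.numRows(), mat_from_df.numCols())     # 2 2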
import numpy as np
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from pyspark.mllib.linalg.distributed import IndexedRowMatrix
from pyspark.mllib.feature import StandardScaler
from sklearn import datasets

# load the iris data and turn it into an RDD of dense vectors
# (sc is an existing SparkContext)
X = datasets.load_iris().data
X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))

# create the standardizer model for centering the dataset (mean removal only)
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)
X_sc = scaler.transform(X_rdd)

# create the IndexedRowMatrix from the RDD
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the SVD factorization of the matrix: the first argument is k, the
# number of leading singular values to keep (here all columns), and the second
# is a boolean stating whether to compute U or not
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k, not k * n (as in sklearn)
P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()

# U is whitened and projected onto the principal-component subspace
S = svd_o.s.toArray()
eig_vals = S ** 2

# fix n_comp at 3 for this tutorial
# n_comp = np.argmax(np.cumsum(eig_vals) / eig_vals.sum() > 0.95) + 1
n_comp = 3
U = svd_o.U.rows.map(lambda x: (x.index, (np.sqrt(num_rows - 1) * x.vector).tolist()[0:n_comp]))

# K is our transformation matrix to obtain the projection onto the PC subspace;
# it is built from the right singular vectors (P_comps) here, since U above has
# been reassigned to an RDD and cannot be divided by S directly
K = (P_comps / S).T[:n_comp]
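A brief follow-up sketch, assuming K from the snippet above is the (n_comp x p) projection/whitening matrix and X_sc the centered RDD of vectors: applying K to the centered rows gives the PCA projection. Collecting to the driver is only a shortcut that works for the small iris dataset; for large data the distributed matrix APIs would be used instead.

# collect the centered rows locally and project them onto the first n_comp
# principal components with K
X_centered = np.array(X_sc.map(lambda v: v.toArray()).collect())
X_proj = X_centered.dot(K.T)            # shape: (num_rows, n_comp)
print(X_proj.shape)

# sanity check: the whitened components should be (nearly) uncorrelated
print(np.round(np.corrcoef(X_proj.T), 3))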
# imports assumed at module level
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix


def sparkComputeCost(self, input_file, x, y, theta):
    sc = SparkContext()

    # add the ones (bias) column while building the RDD; convert fields to floats
    idx = 0
    x_mat = sc.textFile(input_file) \
        .map(lambda line: [float(v) for v in ('1,' + line).split(",")[:-1]]) \
        .zipWithIndex()

    # need a SQLContext() to generate an IndexedRowMatrix from an RDD
    sqlContext = SQLContext(sc)

    x_mat = IndexedRowMatrix(
        x_mat.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    x_mat.cache()
    print("Matrix rows x cols")
    print(x_mat.numRows())
    print(x_mat.numCols())

    # theta as a column vector
    vec = sc.parallelize(theta) \
        .map(lambda line: [line]) \
        .zipWithIndex()
    vec = IndexedRowMatrix(
        vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    vec.cache()
    print("Vector rows x cols")
    print(vec.numRows())
    print(vec.numCols())

    # hypothesis h = X * theta
    h = x_mat.multiply(vec)
    h.cache()
    print("Hypothesis rows x cols")
    print(h.numRows())
    print(h.numCols())

    # target vector y (last field of each line)
    y_vec = sc.textFile(input_file) \
        .map(lambda line: [float(('1,' + line).split(",")[-1])]) \
        .zipWithIndex()
    y_vec = IndexedRowMatrix(
        y_vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    y_vec.cache()

    errors = h.subtract(y_vec).toLocalMatrix()
    print(sum(errors.toArray()))

    '''sparkSession = SparkSession \
        .builder \
        .appName('pyspark') \
        .getOrCreate()
    df = sparkSession.read.csv(input_file)
    df = df \
        .toDF(x, y) \
        .withColumn("Ones", psf.lit(1)) \
        .cache()
    df.select(x, 'Ones').show()'''
    '''sc = SparkContext('local', 'pyspark')
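The function above prints the raw sum of errors but never computes the squared-error cost itself. A minimal sketch of how that could be finished from the local error matrix, assuming the DenseMatrix returned by toLocalMatrix() above; the helper name and the usage line are illustrative, not part of the original code.

def squared_error_cost(errors_local_matrix):
    # errors_local_matrix: the DenseMatrix produced by toLocalMatrix() above
    e = errors_local_matrix.toArray().ravel()   # one residual per training row
    m = e.shape[0]                              # number of training examples
    return float((e ** 2).sum() / (2 * m))      # J(theta) = 1/(2m) * sum(e^2)

# hypothetical usage with the `errors` matrix computed inside the function:
# print(squared_error_cost(errors))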