def test_indexed_row_matrix_from_dataframe(self):
    from pyspark.sql.utils import IllegalArgumentException
    df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
    matrix = IndexedRowMatrix(df)
    self.assertEqual(matrix.numRows(), 1)
    self.assertEqual(matrix.numCols(), 1)
    with self.assertRaises(IllegalArgumentException):
        IndexedRowMatrix(df.drop("_1"))
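# A minimal standalone sketch of the same idea outside the test harness: an
# IndexedRowMatrix can be built either from an RDD of IndexedRow objects or
# from a two-column DataFrame (row index, vector). The existing SparkSession
# name `spark` is assumed here and is not part of the test above.
from pyspark.sql import Row
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# From an RDD of IndexedRow objects.
rows = spark.sparkContext.parallelize([IndexedRow(0, Vectors.dense([1.0])),
                                       IndexedRow(1, Vectors.dense([2.0]))])
mat_from_rdd = IndexedRowMatrix(rows)

# From a DataFrame whose first column is the row index and second the vector.
df = spark.createDataFrame([Row(0, Vectors.dense([1.0])),
                            Row(1, Vectors.dense([2.0]))])
mat_from_df = IndexedRowMatrix(df)

print(mat_from_rdd.numRows(), mat_from_rdd.numCols())  # 2 1
print(mat_from_df.numRows(), mat_from_df.numCols())    # 2 1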
import math
import time

import numpy as np

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# readMatrixRowPerLine, readVector, printVector and the L2 module (DGEMV) are
# project-specific helpers not shown in this snippet.


class ConjugateGradient:
    def __init__(self, args, sc):
        self.EPSILON = 1.0e-5
        self.ctx = sc
        self.numPartitions = args.partitions
        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Read the matrix input data, one row per line.
        if self.numPartitions != 0:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx) \
                .map(lambda line: IndexedRow(line[0], line[1])) \
                .repartition(self.numPartitions)
        else:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx) \
                .map(lambda line: IndexedRow(line[0], line[1]))

        self.inputMatrix = IndexedRowMatrix(inputMatrixData)
        self.inputVector = readVector(self.inputVectorPath, self.ctx)

        if self.numIterations == 0:
            self.numIterations = self.inputVector.size * 2

        self.result = Vectors.zeros(self.inputVector.size)

    def solve(self):
        stop = False
        start = time.time()

        r = np.copy(self.inputVector)
        Ap = Vectors.zeros(self.inputMatrix.numRows())

        # p = r
        p = np.copy(r)
        # rsold = r' * r
        rsold = r.dot(r)

        alpha = 0.0
        rsnew = 0.0
        k = 0

        while not stop:
            # Start -- Ap = A * p (distributed matrix-vector product)
            Ap = L2.DGEMV(1.0, self.inputMatrix, p, 0.0, Ap, self.ctx)
            # End -- Ap = A * p

            # alpha = rsold / (p' * Ap)
            alpha = rsold / p.dot(Ap)
            # x = x + alpha * p
            self.result = self.result + alpha * p
            # r = r - alpha * Ap
            r = r - alpha * Ap
            # rsnew = r' * r
            rsnew = r.dot(r)

            if math.sqrt(rsnew) <= self.EPSILON or k >= self.numIterations:
                stop = True

            # p = r + (rsnew / rsold) * p
            p = r + (rsnew / rsold) * p
            rsold = rsnew
            k += 1
        # END CONJUGATE GRADIENT

        end = time.time()
        print("Total time in solve system is: " + str(end - start) +
              " and " + str(k) + " iterations.")
        printVector(self.result)
        return self.result
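# A minimal single-machine sketch of the same conjugate gradient recurrence,
# useful for sanity-checking the distributed solver on a small system. It
# assumes A is a symmetric positive-definite NumPy array; nothing here comes
# from the class above except the update formulas.
import math

import numpy as np


def cg_local(A, b, epsilon=1.0e-5, max_iterations=None):
    n = b.size
    if max_iterations is None:
        max_iterations = 2 * n
    x = np.zeros(n)
    r = b - A.dot(x)          # initial residual (equals b when x = 0)
    p = r.copy()
    rsold = r.dot(r)
    for k in range(max_iterations):
        Ap = A.dot(p)
        alpha = rsold / p.dot(Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rsnew = r.dot(r)
        if math.sqrt(rsnew) <= epsilon:
            break
        p = r + (rsnew / rsold) * p
        rsold = rsnew
    return x


# Example: solve a small SPD system and compare against numpy.linalg.solve.
A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
print(cg_local(A, b))          # approx. [0.0909, 0.6364]
print(np.linalg.solve(A, b))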
import numpy as np

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg.distributed import IndexedRowMatrix

# iris_rdd and X_rdd are RDDs of feature vectors defined earlier.
# Center the data (no scaling) before computing the SVD.
scaler = StandardScaler(withMean=True, withStd=False).fit(iris_rdd)
X_sc = scaler.transform(X_rdd)

# Create the IndexedRowMatrix from the RDD.
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# Compute the SVD factorization of the matrix. The first argument is the
# number of leading singular values to keep (here all numCols() of them) and
# the second is a boolean stating whether to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k, not k * n (as in sklearn).
P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()

# U is whitened and projected onto the principal components subspace.
S = svd_o.s.toArray()
eig_vals = S ** 2

# Fix n_comp to 3 for this tutorial instead of picking it from the
# explained-variance ratio below.
# n_comp = np.argmax(np.cumsum(eig_vals) / eig_vals.sum() > 0.95) + 1
n_comp = 3
U = svd_o.U.rows.map(
    lambda x: (x.index, (np.sqrt(num_rows - 1) * x.vector).tolist()[0:n_comp]))

# K is the transformation matrix for projecting onto the PC subspace in the
# local NumPy formulation; with U as an RDD it cannot be formed this way.
# K = (U / S).T[:n_comp]

import pyspark.sql.functions as f
import pyspark.sql.types as t

df = spark.createDataFrame(U).toDF("id", "features")
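# A small local sketch of what the factors above give you: projecting centered
# data onto the first n_comp principal components with plain NumPy. X_local is
# a hypothetical small, already centered array standing in for the distributed
# data; the other names mirror the roles of P_comps, S and U above.
import numpy as np

n_comp = 3
X_local = np.random.randn(10, 4)             # stand-in for the centered data
U_l, S_l, Vt_l = np.linalg.svd(X_local, full_matrices=False)
P_comps_local = Vt_l.T                       # columns are the principal axes

# Plain PCA projection onto the leading components.
X_pca = X_local.dot(P_comps_local[:, :n_comp])

# Whitened projection: unit variance along each component, matching the
# sqrt(num_rows - 1) * U scaling used above.
X_white = np.sqrt(X_local.shape[0] - 1) * U_l[:, :n_comp]

print(X_pca.shape, X_white.shape)            # (10, 3) (10, 3)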
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix


def sparkComputeCost(self, input_file, x, y, theta):
    sc = SparkContext()

    # Add the ones vector while building the RDD (drop the label column).
    idx = 0
    x_mat = sc.textFile(input_file) \
        .map(lambda line: ('1, ' + line).split(",")[:-1]) \
        .zipWithIndex()

    # Need a SQLContext() to generate an IndexedRowMatrix from an RDD.
    sqlContext = SQLContext(sc)

    x_mat = IndexedRowMatrix(
        x_mat.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    x_mat.cache()

    print("Matrix rows x cols")
    print(x_mat.numRows())
    print(x_mat.numCols())

    # Parameter vector theta as a single-column BlockMatrix.
    vec = sc.parallelize(theta) \
        .map(lambda line: [line]) \
        .zipWithIndex()
    vec = IndexedRowMatrix(
        vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    vec.cache()

    print("Vector rows x cols")
    print(vec.numRows())
    print(vec.numCols())

    # Hypothesis h = X * theta as a BlockMatrix product.
    h = x_mat.multiply(vec)
    h.cache()

    print("Hypothesis rows x cols")
    print(h.numRows())
    print(h.numCols())

    # Label column y, read from the last field of each line.
    y_vec = sc.textFile(input_file) \
        .map(lambda line: [('1, ' + line).split(",")[-1]]) \
        .zipWithIndex()
    y_vec = IndexedRowMatrix(
        y_vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    y_vec.cache()

    errors = h.subtract(y_vec).toLocalMatrix()
    print(sum(errors.toArray()))

    '''sparkSession = SparkSession \
        .builder \
        .appName('pyspark') \
        .getOrCreate()

    df = sparkSession.read.csv(input_file)
    df = df \
        .toDF(x, y) \
        .withColumn("Ones", psf.lit(1)) \
        .cache()
    df.select(x, 'Ones').show()'''

    '''sc = SparkContext('local', 'pyspark')
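# A separate, standalone sketch of the standard least-squares cost
# J(theta) = (1/(2m)) * sum((h - y)^2), computed with plain NumPy from an
# error array such as errors.toArray() above. The names compute_cost_local
# and example_errors are illustrative only, not taken from the function.
import numpy as np


def compute_cost_local(errors_arr):
    # errors_arr: (m x 1) array of h - y residuals.
    m = errors_arr.shape[0]
    return float(np.sum(errors_arr ** 2) / (2 * m))


# Example with made-up residuals.
example_errors = np.array([[0.5], [-1.0], [2.0]])
print(compute_cost_local(example_errors))   # (0.25 + 1.0 + 4.0) / 6 = 0.875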