def compute_similarity(df):
    """
    Compute the cosine similarity between movies.

    :param df: dataframe of ratings by user for movies
    :return:
    """
    # df = df.filter(df.movieId.isin([91542.0, 1.0, 5.0, 90.0, 2541.0, 1246.0, 1552.0, 4084.0, 5679.0]))
    df = df.groupBy("userId").pivot("movieId").agg(
        first(col('rating')).cast("double"))
    mat = IndexedRowMatrix(
        df.rdd.map(lambda row: IndexedRow(row[0], Vectors.dense(row[1:]))))
    cs = mat.columnSimilarities()

    path = "test"
    cs.entries.toDF().write.parquet(path)
    cs.entries.toDF().coalesce(1)\
        .write.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .save("testtest.csv")
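# Usage note (added for illustration, not from the original snippet): a minimal, hedged
# sketch of how compute_similarity might be invoked, assuming a ratings DataFrame with
# userId/movieId/rating columns; the file path and session name are hypothetical.
# Beware that the pivot inside compute_similarity can produce nulls for movies a user
# never rated, which would need filling before Vectors.dense is applied.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("movie-similarity").getOrCreate()
ratings_df = spark.read.csv("ratings.csv", header=True, inferSchema=True)
compute_similarity(ratings_df)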
def test_indexed_row_matrix_from_dataframe(self):
    from pyspark.sql.utils import IllegalArgumentException
    df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
    matrix = IndexedRowMatrix(df)
    self.assertEqual(matrix.numRows(), 1)
    self.assertEqual(matrix.numCols(), 1)
    with self.assertRaises(IllegalArgumentException):
        IndexedRowMatrix(df.drop("_1"))
def multiply_matrices2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()

    listB = B.tolist()
    rddB = sc.parallelize([IndexedRow(i, listB[i]) for i in range(len(listB))])
    matB = IndexedRowMatrix(rddB).toBlockMatrix()

    matC = matA.multiply(matB).toLocalMatrix()
    return matC.toArray()
def readMovieChar(spark, f_name):
    my_data = list()
    with open(f_name, 'r') as handle:
        reader = csv.reader(handle, delimiter=",", quotechar='"')
        for row in reader:
            my_data.append(row)
    my_data.pop(0)

    matrix = np.zeros(shape=(int(my_data[-1][0]) + 1, len(movie_genre)), dtype=int)
    movie_list = dict()
    for movie in my_data:
        movie_id = int(movie[0])
        movie_list[movie_id] = movie[1]
        genres = movie[2].split('|')
        for each in genres:
            col_idx = movie_genre.get(each, movie_genre['Other'])
            matrix[movie_id][col_idx] = 1

    indexedRows = spark.sparkContext.parallelize(
        [IndexedRow(i, matrix[i]) for i in range(len(matrix))])
    mat = IndexedRowMatrix(indexedRows)
    return mat, movie_list
def _getColumns(blockMat, j, norm=1):
    """
    Returns column(s) j of the input BlockMatrix as a BlockMatrix
    with the same number of rowsPerBlock.
    """
    sc = SparkContext.getOrCreate()
    if np.isscalar(j):
        colsPerBlock = blockMat.colsPerBlock
        jBlockCol = j // colsPerBlock
        jInBlock = j % colsPerBlock
        jBlocks = blockMat.blocks.filter(lambda x: x[0][1] == jBlockCol)

        def g(block):
            colJ = block[1].toArray()[:, jInBlock] / norm
            return ((block[0][0], 0), OldMatrices.dense(len(colJ), 1, colJ))

        colJBlocks = jBlocks.map(g)
        return BlockMatrix(colJBlocks, rowsPerBlock=blockMat.rowsPerBlock,
                           colsPerBlock=1, numCols=1)
    else:
        j_b = sc.broadcast(j)
        blockMat_red = blockMat.toIndexedRowMatrix()
        rows_red = blockMat_red.rows.map(lambda row: (
            row.index,
            OldVectors.dense(row.vector.toArray()[j_b.value] / norm)))
        j_b.unpersist()
        return IndexedRowMatrix(rows_red).toBlockMatrix(
            rowsPerBlock=blockMat.rowsPerBlock,
            colsPerBlock=min(len(j), blockMat.colsPerBlock))
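# Usage sketch (added for illustration, assuming an active SparkContext `sc` and the
# pyspark.mllib.linalg.distributed imports used above); the 3x3 matrix is arbitrary.
rows = sc.parallelize([IndexedRow(0, [1.0, 2.0, 3.0]),
                       IndexedRow(1, [4.0, 5.0, 6.0]),
                       IndexedRow(2, [7.0, 8.0, 9.0])])
bm = IndexedRowMatrix(rows).toBlockMatrix()
col1 = _getColumns(bm, 1)                  # single column -> 3x1 BlockMatrix
cols02 = _getColumns(bm, [0, 2], norm=2)   # columns 0 and 2, each divided by 2
print(col1.toLocalMatrix().toArray())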
def MatrixTranspose(mat):
    # Known issues:
    # 1. This can raise errors for some data; the cause is unclear, but reducing the
    #    number of rows can help.
    # 2. The transpose sometimes returns wrong results, apparently due to partitioning --
    #    repartition(1) sometimes fixes it. PySpark also changes the order of rows when a
    #    transposed CoordinateMatrix is converted to a RowMatrix.
    #    See: https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an IndexedRowMatrix and reordering partially fixes this, but it is awkward.
    '''
    transpose a row matrix -- to save space/memory, use a sparse vector when the input is a sparse vector
    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This turns everything into dense matrix entries; avoid this function where efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF().orderBy("index")
    # back to sparse first, then convert to IndexedRowMatrix
    transposed_mat = transposed_mat.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0],
            row["vector"].values[np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(transposed_mat)
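# An alternative sketch (added, not part of the original function) that sidesteps the
# row-ordering problem described above by transposing through BlockMatrix, which keeps
# explicit block indices end to end; it densifies blocks, trading memory for
# predictable ordering.
def block_transpose(mat):
    """Transpose via BlockMatrix.transpose(); accepts an IndexedRowMatrix or RowMatrix."""
    if not isinstance(mat, IndexedRowMatrix):
        # RowMatrix has no row indices, so attach them first.
        mat = IndexedRowMatrix(
            mat.rows.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))
    return mat.toBlockMatrix().transpose().toIndexedRowMatrix()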
def _fit(self, dataset):
    inputCol = self.getInputCol()
    outputCol = self.getOutputCol()
    ds_rdd = dataset.select(inputCol).rdd
    mat = IndexedRowMatrix(ds_rdd)

    mu = self.getMu()
    l = self.getL()
    if not mu:
        mu = 1.25 * mat.computeSVD(1).s
        print('mu:', mu)
    if not l:
        n_cols = mat.numCols()
        n_rows = mat.numRows()
        l = 1.0 / np.sqrt(np.max((n_cols, n_rows)))
        print('l:', l)
    pass
def matrix(self):
    """
    Gets the matrix backing this LD matrix.

    :return: Matrix of Pearson correlation values.
    :rtype: `IndexedRowMatrix <https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix>`__
    """
    return IndexedRowMatrix(self._jldm.matrix())
def indexed_matrix_from_numpy(M):
    t = tuple(map(tuple, M))
    t2 = range(len(t))
    l = list(t)
    for i in t2:
        l[i] = tuple((i, l[i]))
    idxM = IndexedRowMatrix(sc.parallelize(l))
    return idxM
def multiply_transpose2(A: np.array) -> np.ndarray:  # A * A.T
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)

    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()
    matT = matA.transpose()
    matR = matA.multiply(matT)
    res = matR.toLocalMatrix().toArray()

    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:", elapsed_time)
    counter = counter + 1
    return res
def test_row_matrix_invalid_type(self):
    rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
    invalid_type = ""
    matrix = RowMatrix(rows)
    self.assertRaises(TypeError, matrix.multiply, invalid_type)

    irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]),
                                 IndexedRow(1, [4, 5, 6])])
    imatrix = IndexedRowMatrix(irows)
    self.assertRaises(TypeError, imatrix.multiply, invalid_type)
def vectorDFtoIndexedMatrix(df, vecvar, idcol):
    '''
    applicable to dataframes that already have assembled vectors
    '''
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row[vecvar].size, row[vecvar].indices, row[vecvar].values)))
    return IndexedRowMatrix(df)
def matrix(self):
    """
    Gets the matrix backing this kinship matrix.

    :return: Matrix of kinship values.
    :rtype: `IndexedRowMatrix <https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.IndexedRowMatrix>`__
    """
    from pyspark.mllib.linalg.distributed import IndexedRowMatrix
    return IndexedRowMatrix(self._jkm.matrix())
def df_to_indexed_row_matrix(row_number_col: str, vector_col: str, df: DataFrame):
    """Convert a dataframe containing a row number and a vector column to an IndexedRowMatrix"""
    indexed_rows = (df.where(F.col(vector_col).isNotNull())
                    .select(F.col(row_number_col), F.col(vector_col))
                    .rdd.map(lambda row: IndexedRow(
                        row.__getitem__(row_number_col),
                        row.__getitem__(vector_col).toArray())))
    if indexed_rows.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")
    return IndexedRowMatrix(indexed_rows)
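# Usage sketch (added for illustration): build a tiny DataFrame with a row number and an
# ML vector column, then convert it; `spark` and the column names are assumptions, not
# from the original snippet.
from pyspark.ml.linalg import Vectors as MLVectors

small_df = spark.createDataFrame(
    [(0, MLVectors.dense([1.0, 2.0])), (1, MLVectors.dense([3.0, 4.0]))],
    ["row_number", "features"])
mat = df_to_indexed_row_matrix("row_number", "features", small_df)
print(mat.numRows(), mat.numCols())  # 2 2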
def DFtoIndexedMatrix(df, quantvars, idcol):
    '''
    Convert a numeric dataframe to an IndexedRowMatrix with sparse vectors as the basic
    units; not applicable to dataframes that already have assembled vectors.
    '''
    df = VectorAssembler(
        inputCols=quantvars,
        outputCol="features"
    ).transform(df).select([idcol, "features"])
    # VectorAssembler automatically produces sparse vectors, so the next line should be fine
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row.features.size, row.features.indices, row.features.values)))
    return IndexedRowMatrix(df)
def test_multiply_coordinate_matrices(self, spark: SQLContext):
    a_data = [(0, MllibVectors.dense(0, 3, 4)),
              (1, MllibVectors.dense(1, 2, 3))]
    b_data = [
        (0, MllibVectors.dense(1, 0)),
        (1, MllibVectors.dense(4, 2)),
        (2, MllibVectors.dense(1, 3)),
    ]
    matrix_a = IndexedRowMatrix(
        spark._sc.parallelize(a_data)).toCoordinateMatrix()
    matrix_b = IndexedRowMatrix(
        spark._sc.parallelize(b_data)).toCoordinateMatrix()

    product = matrix.multiply_coordinate_matrices(matrix_a, matrix_b)
    actual = product.toBlockMatrix().toLocalMatrix().toArray()

    expected = [[16.0, 18.0], [12.0, 13.0]]
    assert actual.tolist() == expected
def _dist_matrix(self, rddv1, rddv2, sc):
    dlist1 = rddv1.collect()
    dlist2 = rddv2.collect()
    irows1 = [IndexedRow(i, dlist1[i][0].toArray()) for i in range(0, len(dlist1))]
    irows2 = [IndexedRow(i, dlist2[i][0].toArray()) for i in range(0, len(dlist2))]
    IMatrix1 = IndexedRowMatrix(sc.parallelize(irows1))
    IMatrix2 = IndexedRowMatrix(sc.parallelize(irows2))
    cart = IMatrix1.rows.cartesian(IMatrix2.rows)
    A = cart.map(lambda x: (
        x[0].index,
        x[1].index,
        np.sqrt(np.sum(np.power(np.array(x[0].vector) - np.array(x[1].vector), 2)))
    )).collect()
    A.sort()
    Arr = self.__dist_array(A)
    return Arr
def getConnectivity(self, rddv, spark):
    sc = spark.sparkContext
    radius = self.getRadius()
    dist = self.getDistance()
    dlist = rddv.collect()
    featurecol = self.getFeaturesCol()
    irows = [IndexedRow(i, dlist[i][featurecol].toArray()) for i in range(0, len(dlist))]
    imatrix = IndexedRowMatrix(sc.parallelize(irows))
    cart = imatrix.rows.cartesian(imatrix.rows)
    rows = Row("id", "vector")
    usr_row = [rows(i, np.float_(x).tolist()) for i, x in enumerate(dlist)]
    verts = spark.createDataFrame(usr_row)
    A = cart.filter(lambda x: dist(x[0].vector, x[1].vector) <= radius)\
        .map(lambda x: (x[0].index, x[1].index, 1))
    edges = spark.createDataFrame(A, ['src', 'dst', 'connected'])
    return GraphFrame(verts, edges)
def __index_row_matrix_rdd(self, scale_df):
    """
    :param scale_df: DataFrame with 'scaled_features' (ML vectors) and 'id' columns
    :return: IndexedRowMatrix built from the scaled feature vectors
    """
    try:
        vector_mllib = MLUtils.convertVectorColumnsFromML(
            scale_df, 'scaled_features').drop('features')
        vector_rdd = vector_mllib.select(
            'scaled_features', 'id').rdd.map(lambda x: IndexedRow(x[1], x[0]))
        self.__logger.info("Build Index Row Matrix RDD")
        return IndexedRowMatrix(vector_rdd)
    except TypeError as te:
        raise OpheliaMLException(
            f"An error occurred while calling __index_row_matrix_rdd() method: {te}"
        )
def __init__(self, args, sc):
    self.ctx = sc
    self.numPartitions = args.partitions
    self.inputVectorPath = args.inputVector
    self.inputMatrixPath = args.inputMatrix
    self.outputVectorPath = args.outputVector
    self.alpha = args.alpha
    self.beta = args.beta

    # Read Matrix input data
    # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)
    if self.numPartitions != 0:
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))\
            .repartition(self.numPartitions)
    else:
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))

    print("Number of rows in Matrix with type " + str(type(inputMatrixData)) +
          " is: " + str(inputMatrixData.count()))

    # PipelinedRDD to RDD
    # newData = sc.parallelize(inputMatrixData.collect())
    inputMatrix = IndexedRowMatrix(inputMatrixData)
    inputVector = readVector(self.inputVectorPath, self.ctx)
    print("Vector size is: " + str(inputVector.size))
    result = Vectors.zeros(inputVector.size)
    # print result

    # DGEMV(alpha, A, x, beta, y, jsc):
    result = L2.DGEMV(self.alpha, inputMatrix, inputVector, self.beta, result, self.ctx)
    # writeVector(self.outputVectorPath, result)
    printVector(result)
def dense_matrix_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_matrix: IndexedRowMatrix,
    secondary_row_number_col: str,
    secondary_matrix: DenseMatrix,
):
    """Multiply 2 dense matrices to produce a dataframe with pairwise results showing
    primary row number, secondary column number and the dot product as a score.

    Note that if you are using this method to produce the cosine similarity of 2 dense
    matrices then it is expected that you have already taken the transpose of the
    secondary matrix."""
    product = primary_matrix.multiply(secondary_matrix)
    log.info(
        "finished dense matrix multiplication",
        num_cols=product.numCols(),
        num_rows=product.numRows(),
    )
    coords_matrix = product.toCoordinateMatrix()
    log.info(
        "finished converting row matrix to coordinate matrix",
        num_cols=coords_matrix.numCols(),
        num_rows=coords_matrix.numRows(),
    )
    return coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )
class ConjugateGradient:
    def __init__(self, args, sc):
        self.EPSILON = 1.0e-5
        self.ctx = sc
        self.numPartitions = args.partitions
        self.numIterations = args.iterations
        self.inputVectorPath = args.inputVector
        self.inputMatrixPath = args.inputMatrix
        self.outputVectorPath = args.outputVector

        # Read Matrix input data
        # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)
        if self.numPartitions != 0:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))\
                .repartition(self.numPartitions)
        else:
            inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
                .map(lambda line: IndexedRow(line[0], line[1]))

        self.inputMatrix = IndexedRowMatrix(inputMatrixData)
        self.inputVector = readVector(self.inputVectorPath, self.ctx)
        if self.numIterations == 0:
            self.numIterations = self.inputVector.size * 2
        self.result = Vectors.zeros(self.inputVector.size)

    def solve(self):
        # print result
        stop = False
        start = time.time()
        r = np.copy(self.inputVector)
        Ap = Vectors.zeros(self.inputMatrix.numRows())
        # p = r
        p = np.copy(r)
        # rsold = r' * r
        rsold = r.dot(r)
        alpha = 0.0
        rsnew = 0.0
        k = 0
        while not stop:
            # Start -- Ap = A * p
            Ap = L2.DGEMV(1.0, self.inputMatrix, p, 0.0, Ap, self.ctx)
            # End -- Ap = A * p

            # alpha = rsold / (p' * Ap)
            alpha = rsold / p.dot(Ap)
            # x = x + alpha * p
            self.result = self.result + alpha * p
            # r = r - alpha * Ap
            r = r - alpha * Ap
            # rsnew = r' * r
            rsnew = r.dot(r)
            if (math.sqrt(rsnew) <= self.EPSILON) or (k >= self.numIterations):
                stop = True
            # p = r + rsnew / rsold * p
            p = r + (rsnew / rsold) * p
            rsold = rsnew
            k += 1
        # End of conjugate gradient

        end = time.time()
        print("Total time in solve system is: " + str(end - start) +
              " and " + str(k) + " iterations.")
        printVector(self.result)
        return self.result
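# A small local NumPy reference of the same conjugate-gradient recurrence (added as a
# hedged sketch, not part of the original class); handy for checking the distributed
# solver on small symmetric positive-definite systems.
import numpy as np

def cg_reference(A, b, eps=1.0e-5, max_iter=None):
    """Solve A x = b for symmetric positive-definite A with plain conjugate gradient."""
    max_iter = max_iter or 2 * b.size
    x = np.zeros_like(b, dtype=float)
    r = b - A @ x
    p = r.copy()
    rsold = r @ r
    for _ in range(max_iter):
        Ap = A @ p
        alpha = rsold / (p @ Ap)
        x = x + alpha * p
        r = r - alpha * Ap
        rsnew = r @ r
        if np.sqrt(rsnew) <= eps:
            break
        p = r + (rsnew / rsold) * p
        rsold = rsnew
    return x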
# --.reduceByKey(lambda a, b: a + b)\
# --.map(lambda x: (x[0], sorted(x[1], key=lambda x: x[0], reverse=True)))\
# --.map(lambda x: (x[0], [p[1] for p in x[1]]))\
# --.map(lambda x: x[1])\
# --.zipWithIndex()
# ------------------------------------------
# do I have a 2D matrix now?
print("# do I have a 2D matrix now --> FULLY PREDICTED ????????????????????????")
for item in final_stars_FINAL_READY.collect():
    print(item)
print("# do I have a 2D matrix now --> FULLY PREDICTED ??????????????????????? ==> NOW WE KNOw .........")

iris_irm = IndexedRowMatrix(
    final_stars_FINAL_READY.map(lambda x: IndexedRow(x[1], x[0])))

# ------------------------------------------
# https://blog.paperspace.com/dimension-reduction-with-principal-component-analysis/
# do SVD:
num_of_top_sing_values = 2
SVD = iris_irm.computeSVD(num_of_top_sing_values, True)

U = SVD.U
S = SVD.s.toArray()

# compute the eigenvalues and number of components to retain
n = final_stars_FINAL_READY.count()
eigvals = S**2 / (n - 1)
eigvals = np.flipud(np.sort(eigvals))
cumsum = eigvals.cumsum()
data = sc.wholeTextFiles(root + folders[f])
data.cache()
documents = data.map(lambda s: tokenize(s[1])).map(
    lambda s: remove_stopwords(s, stopwords))
files = data.map(lambda s: s[0]).collect()
documents.cache()

hashingTF = HashingTF()
featurizedData = hashingTF.transform(documents)
idf = IDF()
idfModel = idf.fit(featurizedData)
featurizedData.cache()
tfidfs = idfModel.transform(featurizedData)
tfidfs.cache()

final_rdd = tfidfs.zipWithIndex().map(lambda s: IndexedRow(s[1], s[0]))
final_rdd.cache()
sims = IndexedRowMatrix(final_rdd).toCoordinateMatrix().transpose()\
    .toIndexedRowMatrix().columnSimilarities()
pairs = sims.entries.map(lambda m: [m.i, m.j, m.value]).collect()
for p in range(0, len(pairs)):
    pairs.append([pairs[p][1], pairs[p][0], pairs[p][2]])

results = []
for p in range(0, len(files)):
    results.append([p, 0, 0.0])
for p in range(0, len(pairs)):
    index = pairs[p][0]
    if pairs[p][2] > results[index][2]:
        results[index] = [index, pairs[p][1], pairs[p][2]]

file_object = open("/home/user/out/" + folders[f] + ".csv", "w")
for i in range(0, len(files)):
    file_object.write(
        str(results[i][0]) + ";" + str(results[i][1]) + ";" +
        str(results[i][2]) + "\n")
def as_block_matrix(rdd, rowsPerBlock=65000, colsPerBlock=65000):
    return IndexedRowMatrix(
        rdd.zipWithIndex().map(lambda xi: IndexedRow(xi[1], xi[0]))
    ).toBlockMatrix(rowsPerBlock, colsPerBlock)
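# Usage sketch (added for illustration, assuming an active SparkContext `sc`): multiply
# two NumPy arrays through as_block_matrix and compare against the local product.
import numpy as np

A = np.arange(6, dtype=float).reshape(2, 3)
B = np.arange(12, dtype=float).reshape(3, 4)
bm_a = as_block_matrix(sc.parallelize(A.tolist()))
bm_b = as_block_matrix(sc.parallelize(B.tolist()))
product = bm_a.multiply(bm_b).toLocalMatrix().toArray()
assert np.allclose(product, A @ B)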
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: spark-submit generate_similarity_matrix.py <input path to hdfs file> <hdfs output path>",
            file=sys.stderr)
        exit(-1)

    # convert and process raw input to (bookid, [features])
    def processFeatures(raw):
        features_str = raw.split()
        book_id = int(features_str[0])
        features = []
        for i in range(1, len(features_str)):
            features.append(float(features_str[i]))
        return (book_id, features)

    sc = SparkContext(appName="BookRecSystem")
    spark = SQLContext(sc)

    featureRdd = sc.textFile(sys.argv[1])
    featureRdd = featureRdd.map(processFeatures)

    labels = featureRdd.map(lambda x: x[0])              # label_rdd
    fvecs = featureRdd.map(lambda x: Vectors.dense(x[1]))  # feature_rdd
    data = labels.zip(fvecs)

    # convert to block matrix for pairwise cosine similarity
    mat = IndexedRowMatrix(data).toBlockMatrix()
    # pairwise cosine similarity to rdd
    dot = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows.map(
        lambda x: (x.index, x.vector.toArray())).sortByKey().map(
        lambda x: str(x[0]) + ' '.join(map(str, x[1])))

    dot.saveAsTextFile(sys.argv[2])  # save output
    sc.stop()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import numpy as np
import os
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix

os.environ["SPARK_HOME"] = "C:\\Users\\plfoley\\spark-2.3.1-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:\\Users\\plfoley\\winutils"

sc = SparkContext()

rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) \
    .zipWithIndex()
rows2 = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) \
    .zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)

rows = IndexedRowMatrix(
    rows.map(lambda row: IndexedRow(row[1], row[0]))
).toBlockMatrix()
rows2 = IndexedRowMatrix(
    rows2.map(lambda row2: IndexedRow(row2[1], row2[0]))
).toBlockMatrix()

mat_product = rows.multiply(rows2).toLocalMatrix()
print(mat_product)
    .getOrCreate()

lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
articles = lines.map(lambda urls: getArticletText(urls))

hashingTF = HashingTF()
tf = hashingTF.transform(articles)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

rows = tfidf.zipWithIndex()
bm = IndexedRowMatrix(rows.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()
# bm_t = bm.transpose()
# result_mat = bm.multiply(bm_t)
# exact = result_mat.toIndexedRowMatrix().toRowMatrix()
exact = bm.transpose().toIndexedRowMatrix().columnSimilarities()
print(exact.entries.collect())
# print(exact.entries.collect()[0])

# parsedArticles = articles.collect()
# tfidf = TfidfVectorizer().fit_transform(parsedArticles)
# pairwise_similarity = tfidf * tfidf.T
def sparkComputeCost(self, input_file, x, y, theta):
    sc = SparkContext()

    # add the ones vector while building the RDD
    idx = 0
    x_mat = sc.textFile(input_file) \
        .map(lambda line: ('1, ' + line).split(",")[:-1]) \
        .zipWithIndex()

    # need a SQLContext() to generate an IndexedRowMatrix from RDD
    sqlContext = SQLContext(sc)

    x_mat = IndexedRowMatrix(
        x_mat.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    x_mat.cache()
    print("Matrix rows x cols")
    print(x_mat.numRows())
    print(x_mat.numCols())

    vec = sc.parallelize(theta) \
        .map(lambda line: [line]) \
        .zipWithIndex()
    vec = IndexedRowMatrix(
        vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    vec.cache()
    print("Vector rows x cols")
    print(vec.numRows())
    print(vec.numCols())

    h = x_mat.multiply(vec)
    h.cache()
    print("Hypothesis rows x cols")
    print(h.numRows())
    print(h.numCols())

    y_vec = sc.textFile(input_file) \
        .map(lambda line: [('1, ' + line).split(",")[-1]]) \
        .zipWithIndex()
    y_vec = IndexedRowMatrix(
        y_vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    y_vec.cache()

    errors = h.subtract(y_vec).toLocalMatrix()
    print(sum(errors.toArray()))

    '''sparkSession = SparkSession \
        .builder \
        .appName('pyspark') \
        .getOrCreate()
    df = sparkSession.read.csv(input_file)
    df = df \
        .toDF(x, y) \
        .withColumn("Ones", psf.lit(1)) \
        .cache()
    df.select(x, 'Ones').show()'''

    '''sc = SparkContext('local', 'pyspark')
# Code for PCA and whitening the dataset.
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets

# create the standardizer model for standardizing the dataset
X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)

# create the IndexedRowMatrix from rdd
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the SVD factorization of the matrix. The first argument is the number of
# columns, the second a boolean stating whether to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k, not k * n (as in sklearn)
P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()

# U is whitened and projected onto the principal components subspace.
S = svd_o.s.toArray()
eig_vals = S**2
# change the ncomp to 3 for this tutorial
# n_comp = np.argmax(np.cumsum(eig_vals)/eig_vals.sum() > 0.95) + 1