def edgesToSDM(self, truncated_vertices):
    self.edges.createOrReplaceTempView("edges")
    truncated_vertices.createOrReplaceTempView("vertices")
    real_edges = self.spark.sql(
        "select * from edges where edges.src in (select id from vertices) "
        "or edges.dst in (select id from vertices)"
    ).persist(StorageLevel.MEMORY_AND_DISK)
    real_edges.createOrReplaceTempView("real_edges")
    noedge_vertices = self.spark.sql(
        "select * from vertices where vertices.id not in (select real_edges.src from real_edges) "
        "and vertices.id not in (select real_edges.dst from real_edges)"
    )
    arti_edges = noedge_vertices.withColumnRenamed("id", "src").join(
        truncated_vertices.select("id").withColumnRenamed("id", "dst"))
    arti_edges = arti_edges.filter(F.col('src') != F.col('dst'))

    # src to dst
    entries_1 = real_edges.rdd.map(
        lambda row: MatrixEntry(row.src, row.dst, 1))
    # dst to src
    entries_2 = real_edges.rdd.map(
        lambda row: MatrixEntry(row.dst, row.src, 1))
    # self transition
    entries_3 = truncated_vertices.select("id").rdd.map(
        lambda row: MatrixEntry(row.id, row.id, 1))
    # edges to avoid self-loop with no uncertainty (randomly distribute the
    # importance of the current node [with artificial edges])
    entries_4 = arti_edges.rdd.map(
        lambda row: MatrixEntry(row.src, row.dst, 1))

    entries = entries_1.union(entries_2.union(
        entries_3.union(entries_4))).persist(StorageLevel.MEMORY_AND_DISK)
    size = truncated_vertices.count()
    self.edges_sdm = sdm.SparseDistributedMatrix(entries, size, size)
    return self.edges_sdm
def do_cartesian(sc, df, id_col=None, feature_col=None, **kwargs):
    import functools
    sigma = kwargs.get('sigma', 0.42)
    tol = kwargs.get('tol', 10e-10)
    standardize = kwargs.get('standardize', True)

    if isinstance(feature_col, list):
        feature_col, scaled_df = _make_feature_vector(df=df, feature_col=feature_col)
    if standardize:
        scaled_df = _scale_data_frame(scaled_df, vector=feature_col)

    if id_col:
        vector_dict = scaled_df.select(id_col, feature_col).rdd.collectAsMap()
    else:
        vector_dict = (scaled_df.select(feature_col)
                       .rdd.zipWithIndex().map(lambda x: (x[1], x[0][feature_col]))
                       .collectAsMap())
    bc_vec = sc.broadcast(vector_dict)

    index_rdd = df.rdd.map(lambda x: x[id_col]).cache()
    bfs = functools.partial(_compute_bfs)
    cartesian_demon = index_rdd.cartesian(index_rdd).filter(lambda x: x[0] >= x[1])
    cartesian_distance_demon = cartesian_demon.map(
        lambda x: MatrixEntry(x[0], x[1], bfs(
            vec_1=bc_vec.value.get(x[0]),
            vec_2=bc_vec.value.get(x[1]),
            sigma=sigma))
    )
    index_rdd.unpersist()  # Memory cleanup!
    tol_cut = functools.partial(_tolerance_cut, tol=tol)
    return cartesian_distance_demon.filter(lambda x: tol_cut(x.value))
def build_matrix(svo_path: str,
                 cat1_instances: set,
                 cat2_instances: set) -> CoordinateMatrix:
    raw_df = spark.read.csv(svo_path, sep='\t')
    pairs_df = (raw_df.filter(
        (f.col('_c0').isin(cat1_instances) & f.col('_c2').isin(cat2_instances))
        | (f.col('_c0').isin(cat2_instances) & f.col('_c2').isin(cat1_instances)))
        .rdd.map(lambda x: (tuple(sorted((x['_c0'], x['_c2']))), x['_c1'], int(x['_c3'])))
        .toDF(['pair', 'verb', 'n']))
    named_coords = (pairs_df.selectExpr('pair', 'verb as left_verb', 'n')
                    .join(pairs_df.selectExpr('pair', 'verb as right_verb'), 'pair')
                    .filter('left_verb < right_verb')
                    .groupby(['left_verb', 'right_verb']).count())
    verb_to_id = (pairs_df.select('verb').distinct().rdd.zipWithIndex()
                  .map(lambda r: [r[0].verb, r[1]]).toDF(['verb', 'id']))
    coords = (named_coords
              .join(verb_to_id, named_coords.left_verb == verb_to_id.verb)
              .selectExpr('right_verb', 'id as left_verb_id', 'count')
              .join(verb_to_id, named_coords.right_verb == verb_to_id.verb)
              .selectExpr('left_verb_id', 'id as right_verb_id', 'count'))
    matrix = CoordinateMatrix(coords.rdd.map(lambda c: MatrixEntry(*c)))
    return matrix
def MatrixTranspose(mat):
    '''
    Transpose a row matrix; to save space/memory, use a sparse vector when the
    input is sparse.

    Known issues:
    1. Causes errors for some data (reason unclear); reducing the number of rows can help.
    2. The transpose sometimes returns a wrong result, which seems to be a partition
       issue -- repartition(1) sometimes fixes it. Also, pyspark changes the order of
       rows after a transposed coordinate matrix is converted to a row matrix.
       Bug ref: https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
       Using an indexed matrix and reordering partially fixes this, but it is awkward.

    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This flatMap turns everything into dense matrix entries; avoid this
    # function when efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    # Back to an indexed row matrix, then reorder by the row index.
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")

    # Rebuild each row as a sparse vector. Materialise the dense array first so
    # the non-zero indices refer to column positions rather than positions
    # inside a SparseVector's value array.
    def _to_sparse_indexed_row(row):
        arr = np.asarray(row["vector"].toArray())
        nz = np.nonzero(arr)[0]
        return IndexedRow(row["index"],
                          MLLibVectors.sparse(arr.size, nz, arr[nz]))

    transposed_mat = transposed_mat.rdd.map(_to_sparse_indexed_row)
    return IndexedRowMatrix(transposed_mat)
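A minimal round-trip check of MatrixTranspose on a small matrix; this sketch assumes an active SparkContext `sc` and SparkSession (needed by the internal toDF call), and the 2x3 input is made up for illustration.

from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# 2x3 matrix with a few zeros so the sparse reconstruction is exercised.
small = IndexedRowMatrix(sc.parallelize([
    IndexedRow(0, [1.0, 0.0, 2.0]),
    IndexedRow(1, [0.0, 3.0, 0.0]),
]))
small_t = MatrixTranspose(small)
print(small_t.numRows(), small_t.numCols())   # expect 3 2
for row in sorted(small_t.rows.collect(), key=lambda r: r.index):
    print(row.index, row.vector)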
def readRatings(spark, f_name, ratio=[0.8, 0.2], seed=0):
    """Read the ratings of users for movies.

    Return the (training, test) utility matrices."""
    df = spark.read.csv(f_name, header=True)
    # df = normalize(spark, df)
    rdd = df.rdd
    (training, test) = df.randomSplit(ratio, seed=seed)
    # users_total and movies_total are assumed to be module-level constants
    # giving the matrix dimensions.
    training_utility = CoordinateMatrix(
        training.rdd.map(lambda row: MatrixEntry(
            row['userId'], row['movieId'], row['rating'])),
        users_total, movies_total)
    test_utility = CoordinateMatrix(
        test.rdd.map(lambda row: MatrixEntry(
            row['userId'], row['movieId'], row['rating'])),
        users_total, movies_total)
    return (training_utility, test_utility)
def setUp(self):
    spark = SparkSession(sparkContext=self.sc)
    y = np.array([[1, 0, 0],
                  [0, 1, 0],
                  [0, 0, 1],
                  [0.5, 0.5, 0.5],
                  [0.5, 0.5, 0.5],
                  [0.5, 0.5, 0.5]])
    X_triangle = np.array([[1, 0, 0, 0, 0, 0],
                           [2, 1, 0, 0, 0, 0],
                           [3, 4, 1, 0, 0, 0],
                           [5, 6, 7, 1, 0, 0],
                           [1, 4, 2, 1, 1, 0],
                           [1, 1, 1, 1, 1, 1]])
    self.y_shape = y.shape
    self.longMessage = True
    self.X_shape = X_triangle.shape
    self.X_real = X_triangle + X_triangle.T - np.eye(6)
    self.product = self.X_real.dot(y)
    self.rdd_y = (self.sc.parallelize(y)
                  .map(lambda x: x.tolist()).map(lambda x: list(enumerate(x)))
                  .zipWithIndex()
                  .flatMap(lambda x: [MatrixEntry(i=x[1], j=jdx, value=val)
                                      for jdx, val in x[0]])
                  .filter(lambda x: x.value != 0.))
    self.rdd_X = (self.sc.parallelize(X_triangle)
                  .map(lambda x: x.tolist()).map(lambda x: list(enumerate(x)))
                  .zipWithIndex()
                  .flatMap(lambda x: [MatrixEntry(i=x[1], j=jdx, value=val)
                                      for jdx, val in x[0]])
                  .filter(lambda x: x.value != 0.))
def multiply(self, B):
    a, b = self._pre_arithmetic_op(self, B)
    c = a.union(b).groupByKey().map(
        lambda x: MatrixEntry(
            x[0][0], x[0][1],
            x[1].data[0] * x[1].data[1] if len(x[1].data) == 2 else 0)
    )
    return SparseDistributedMatrix(c, self.numRows(), self.numCols())

# def multiply(self, b: float):
#     c = self.entries.map(
#         lambda entry: MatrixEntry(entry.i, entry.j, entry.value * b)
#     )
#     return SparseDistributedMatrix(self.sc, c, self.numRows(), self.numCols())
def _dot1(self, B):
    if self.numCols() != B.numRows():
        raise Exception(
            f"size mismatch {(self.numRows(), self.numCols())}, "
            f"{(B.numRows(), B.numCols())}")
    a = self._pre_dot(self, 'row')
    b = self._pre_dot(B, 'col')
    c = a.cartesian(b).map(
        lambda x: MatrixEntry(x[0][0], x[1][0], x[0][1].dot(x[1][1]))
    ).filter(
        lambda entry: entry.value != 0.0
    )
    return SparseDistributedMatrix(c, self.numRows(), B.numCols())
def newW(R, W, H):
    # Multiplicative update, NumPy equivalent:
    # W = np.multiply((R.dot(H.T)) / (W.dot(H).dot(H.T)), W)
    a = R.multiply(H.transpose()).toCoordinateMatrix().entries\
        .map(lambda entry: ((entry.i, entry.j), (0, entry.value)))
    b = W.multiply(H).multiply(H.transpose()).toCoordinateMatrix().entries\
        .map(lambda entry: ((entry.i, entry.j), (1, entry.value)))
    # Identify the right order of dividing: the entry tagged 0 is the numerator.
    c = a.union(b).reduceByKey(
        lambda a, b: (2, a[1] / b[1]) if a[0] == 0 else (2, b[1] / a[1]))
    c = c.map(lambda x: ((x[0][0], x[0][1]), x[1][1]))
    # Multiply each ratio by the corresponding entry of W.
    d = c.join(W.toCoordinateMatrix().entries
               .map(lambda entry: ((entry.i, entry.j), entry.value)))\
        .mapValues(lambda v: v[0] * v[1])
    return CoordinateMatrix(
        d.map(lambda x: MatrixEntry(x[0][0], x[0][1], x[1]))).toBlockMatrix()
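A small sanity check for newW, assuming an active SparkContext `sc`; the 2x2 matrices and the coo helper below are made up for illustration.

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

def coo(triples, m, n):
    # Hypothetical helper: build a BlockMatrix from (i, j, value) triples.
    return CoordinateMatrix(
        sc.parallelize([MatrixEntry(i, j, v) for i, j, v in triples]),
        m, n).toBlockMatrix()

R = coo([(0, 0, 1.0), (0, 1, 2.0), (1, 0, 3.0), (1, 1, 4.0)], 2, 2)
W = coo([(0, 0, 0.5), (0, 1, 0.5), (1, 0, 0.5), (1, 1, 0.5)], 2, 2)
H = coo([(0, 0, 0.5), (0, 1, 0.5), (1, 0, 0.5), (1, 1, 0.5)], 2, 2)

# One multiplicative update step of the NMF factor W.
print(newW(R, W, H).toLocalMatrix())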
def get_vectors_df(playcounts_df):
    """
    Each row of playcounts_df has the following columns: recording_id, spark_user_id
    and a play count denoting how many times a user has played that recording.
    However, the correlation matrix requires a dataframe having a column of user
    vectors. Spark has various representations built-in for storing sparse matrices.
    Of these, two are Coordinate Matrix and Indexed Row Matrix. A coordinate matrix
    stores the matrix as tuples of (i, j, x) where matrix[i, j] = x. An Indexed Row
    Matrix stores it as tuples of row index and vectors.

    Our playcounts_df is similar in structure to a coordinate matrix. We begin by
    mapping each row of the playcounts_df to a MatrixEntry and then create a matrix
    of these entries. The recording_ids are rows, user_ids are columns and the
    playcounts are the values in the matrix. We then convert the coordinate matrix
    to an indexed row matrix. Spark ML and MLlib have different representations of
    vectors, hence we need to manually convert between the two. Finally, we take
    the rows and create a dataframe from them.
    """
    tuple_mapped_rdd = playcounts_df.rdd.map(lambda x: MatrixEntry(
        x["recording_id"], x["spark_user_id"], x["count"]))
    coordinate_matrix = CoordinateMatrix(tuple_mapped_rdd)
    indexed_row_matrix = coordinate_matrix.toIndexedRowMatrix()
    vectors_mapped_rdd = indexed_row_matrix.rows.map(
        lambda r: (r.index, r.vector.asML()))
    return listenbrainz_spark.session.createDataFrame(
        vectors_mapped_rdd, ['index', 'vector'])
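Illustrative only: the same MatrixEntry to CoordinateMatrix to IndexedRowMatrix pipeline on a toy dataframe, using a generic SparkSession named `spark` instead of listenbrainz_spark.session.

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

toy_df = spark.createDataFrame(
    [(0, 0, 2), (0, 1, 5), (1, 1, 1)],
    ["recording_id", "spark_user_id", "count"])
entries = toy_df.rdd.map(lambda x: MatrixEntry(
    x["recording_id"], x["spark_user_id"], x["count"]))
rows = CoordinateMatrix(entries).toIndexedRowMatrix().rows
vectors_df = spark.createDataFrame(
    rows.map(lambda r: (r.index, r.vector.asML())), ["index", "vector"])
vectors_df.show(truncate=False)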
def matrix_multiply(A, B):
    '''
    Returns the matrix product of two matrices represented in CoordinateMatrix
    format. It is implemented with simple joins, following the Scala
    implementation described at
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703

    A: CoordinateMatrix
    B: CoordinateMatrix

    returns: CoordinateMatrix holding the product of A and B
    '''
    from operator import add

    # Convert to RDDs of (column, (row, value)) and (row, (column, value)).
    A_rdd = A.entries.map(lambda x: (x.j, (x.i, x.value)))
    B_rdd = B.entries.map(lambda x: (x.i, (x.j, x.value)))
    # Join the two RDDs and emit ((row, column), value) products.
    interm_rdd = A_rdd.join(B_rdd).map(
        lambda x: ((x[1][0][0], x[1][1][0]), x[1][0][1] * x[1][1][1]))
    # Sum the products for each (row, column) pair and convert each result
    # into a MatrixEntry of (row, column, value).
    C_rdd = interm_rdd.reduceByKey(add).map(
        lambda x: MatrixEntry(x[0][0], x[0][1], x[1]))
    return CoordinateMatrix(C_rdd)
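A quick check of matrix_multiply against a dense NumPy product on tiny inputs; it assumes an active SparkContext `sc`, and the 2x2 matrices are made up for illustration.

import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

A = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 1, 3.0)]))
B = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 4.0), MatrixEntry(1, 0, 5.0), MatrixEntry(1, 1, 6.0)]))

# Collect the sparse result into a dense array and compare with NumPy.
C = np.zeros((2, 2))
for e in matrix_multiply(A, B).entries.collect():
    C[int(e.i), int(e.j)] = e.value

expected = np.array([[1.0, 2.0], [0.0, 3.0]]) @ np.array([[4.0, 0.0], [5.0, 6.0]])
assert np.allclose(C, expected)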
def transpose(rm):
    cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    return cm.transpose().toRowMatrix()
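A minimal usage sketch, assuming an active SparkContext `sc`; note that, as the MatrixTranspose comments above point out, toRowMatrix() does not guarantee row order.

from pyspark.mllib.linalg.distributed import RowMatrix

rm = RowMatrix(sc.parallelize([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]))
rm_t = transpose(rm)
print(rm_t.numRows(), rm_t.numCols())   # 3 2 for a 2x3 input
print(rm_t.rows.collect())              # row order is not guaranteed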
def mapFuncJ(entry):
    return MatrixEntry(entry.j, entry.i + self.numDimI * entry.k, entry.val)
def mapFuncK(entry):
    return MatrixEntry(entry.k, entry.j + self.numDimJ * entry.i, entry.val)
import time

from pyspark import SparkConf, SparkContext, sql
from pyspark.mllib.linalg.distributed import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry

if __name__ == "__main__":
    # set up spark context and configuration
    conf = SparkConf().setAppName("PythonPCAOnRowMatrixExample")
    sc = SparkContext(conf=conf)
    print(sc.getConf().getAll())
    sqlContext = sql.SQLContext(sc)

    # load data
    data = sc.textFile("gs://dataproc-ae279739-4c78-478e-9024-8b7ea842f82e-us/heart1.txt")
    entries = data.map(lambda l: l.split(' ')).map(
        lambda l: MatrixEntry(int(l[0]), int(l[1]), float(l[2])))

    # create RowMatrix
    premat = CoordinateMatrix(entries)
    mat = premat.toIndexedRowMatrix()
    print(mat.numCols())
    print(mat.numRows())

    # gramian
    start_time = time.time()
    decomp = mat.computeGramianMatrix()
    elapsedtime = time.time() - start_time
    print(elapsedtime)

    # svd
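As a sanity check of computeGramianMatrix, here is a tiny version of the same pipeline compared against NumPy; it assumes an active SparkContext `sc`, and the 3x2 matrix is made up.

import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

small_entries = sc.parallelize(
    [MatrixEntry(0, 0, 1.0), MatrixEntry(1, 1, 2.0), MatrixEntry(2, 0, 3.0)])
small = CoordinateMatrix(small_entries).toIndexedRowMatrix()
gram = small.computeGramianMatrix()          # A^T A as a local matrix

dense = np.array([[1.0, 0.0], [0.0, 2.0], [3.0, 0.0]])
print(gram.toArray())
print(dense.T.dot(dense))                    # should match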
def mapFuncI(entry):
    return MatrixEntry(entry.i, entry.k + self.numDimK * entry.j, entry.val)
indexed = sc.parallelize([
    IndexedRow(0, [1, 2, 3]),
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12])
])
mat = IndexedRowMatrix(indexed)
print(mat)

# convert to row matrix
rowMat = mat.toRowMatrix()
print(rowMat)

# A CoordinateMatrix is distributed and stored in an object called a coordinate list.
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

entries = sc.parallelize(
    [MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
mat = CoordinateMatrix(entries)
m = mat.numRows()
n = mat.numCols()
print(m)
print(n)

# convert to indexed row matrix
rowMat = mat.toIndexedRowMatrix()
print(rowMat)
def to_matrix_entry(x):
    i, j, v = x.split()
    return MatrixEntry(i, j, v)
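A small usage sketch, assuming an active SparkContext `sc`; the three whitespace-separated triples stand in for lines of a real input file.

from pyspark.mllib.linalg.distributed import CoordinateMatrix

lines = sc.parallelize(["0 0 1.5", "1 2 3.0", "2 1 0.5"])
mat = CoordinateMatrix(lines.map(to_matrix_entry))
print(mat.numRows(), mat.numCols())   # 3 3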
def outer(self, v):
    c = self.rdd.cartesian(v.rdd).map(lambda x: MatrixEntry(
        x[0][0], x[1][0], float(x[0][1] * x[1][1]))).filter(
            lambda entry: entry.value != 0.0)
    return sdm.SparseDistributedMatrix(c, self.size, v.size)
def diag(vect):
    c = vect.rdd.map(
        lambda entry: MatrixEntry(entry[0], entry[0], entry[1])
    )
    return SparseDistributedMatrix(c, vect.size, vect.size)
def transpose(self):
    entries = self.entries.map(
        lambda entry: MatrixEntry(entry.j, entry.i, entry.value)
    )
    return SparseDistributedMatrix(entries, self.numCols(), self.numRows())
coo_matrix_input_all.cache()

# Diagonalize RDD: for each group of three columns, accumulate the row sums
# and place their negatives on the corresponding diagonal entries.
diag_entries_1 = (coo_matrix_input_all
                  .filter(lambda e: e[1] % 3 == 0)
                  .map(lambda e: (e[0], e[2]))
                  .reduceByKey(lambda x, y: x + y)
                  .map(lambda rv: (rv[0], 3 * (rv[0] // 3), -rv[1])))
diag_entries_1.cache()
diag_entries_2 = (coo_matrix_input_all
                  .filter(lambda e: e[1] % 3 == 1)
                  .map(lambda e: (e[0], e[2]))
                  .reduceByKey(lambda x, y: x + y)
                  .map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 1, -rv[1])))
diag_entries_2.cache()
diag_entries_3 = (coo_matrix_input_all
                  .filter(lambda e: e[1] % 3 == 2)
                  .map(lambda e: (e[0], e[2]))
                  .reduceByKey(lambda x, y: x + y)
                  .map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 2, -rv[1])))
diag_entries_3.cache()

diag_entries = diag_entries_1.union(diag_entries_2).union(diag_entries_3)
coo_matrix_input_all = coo_matrix_input_all.union(diag_entries)
coo_matrix_entries = coo_matrix_input_all.map(lambda e: MatrixEntry(e[0], e[1], e[2]))
coo_matrix = CoordinateMatrix(coo_matrix_entries)

# Save to a file
coo_matrix_input_all.repartition(1).saveAsTextFile("./Laplacian_4v7o_4cores_1")

t2 = timeit.default_timer()
print("Elapsed time for construction: {:} s".format(t2 - t0))

# Singular value decomposition
dataRows = coo_matrix.toRowMatrix().rows
k = int(args.k)  # number of singular values
svd = RowMatrix(dataRows.persist()).computeSVD(k, computeU=True)
def to_matrix_entry(s):
    ss = s.split()
    entry = MatrixEntry(float(ss[0]), float(ss[1]), float(ss[2]))
    return entry
spark = SparkSession.builder\
    .appName("linalgtest")\
    .getOrCreate()
# conf = SparkConf().setAppName('linalgtest')
# sc = SparkContext(conf=conf).getOrCreate()

# use local spark on computer
# findspark.init()
# from pyspark.sql import SparkSession

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'
rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())

start_svd = time.time()
NUM_TIMES = 10  # run 10 times to get a mean timing
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)
import sys

K = 5

## Read data.
txt = sc.textFile('./data/com-amazon.ungraph.txt')
txt = txt.sample(False, 0.001, 1)  # XXX: random sample for local testing
txt = txt.zipWithIndex().filter(lambda x: int(x[1]) >= 4).map(
    lambda x: x[0].split('\t'))

## Get graph Laplacian
N = txt.flatMap(lambda x: [int(xx) for xx in x]).max()
upper_entries = txt.map(
    lambda x: MatrixEntry(int(x[0]) - 1, int(x[1]) - 1, 1.0))
lower_entries = txt.map(
    lambda x: MatrixEntry(int(x[1]) - 1, int(x[0]) - 1, 1.0))
degrees = upper_entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(
    lambda a, b: a + b)
W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# XXX:
laplacian = sys.argv[1]
if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':