from pyspark.mllib.linalg.distributed import CoordinateMatrix

def process(sparkContext, sqlContext):
    # Define database connection parameters
    MYSQL_USERNAME = '******'
    MYSQL_PASSWORD = '******'
    MYSQL_CONNECTION_URL = "jdbc:mysql://qcis4:3306/dblp?user=" + MYSQL_USERNAME + "&password=" + MYSQL_PASSWORD

    # Load the author/paper table over JDBC.
    df = sqlContext.read.format("jdbc").options(
        url=MYSQL_CONNECTION_URL,
        driver="com.mysql.jdbc.Driver",
        dbtable="dblp.author_sample",
    ).load()

    # Map each distinct author hash to a row index and each paper hash to a column index.
    rows = df.select("name_hash").distinct().rdd.map(lambda r: r.name_hash).collect()
    columns = df.select("paper_hash").distinct().rdd.map(lambda r: r.paper_hash).collect()

    # Each (author, paper) pair becomes a matrix entry (row, col, 1.0).
    rawData = df.rdd.map(lambda p: (rows.index(p.name_hash), columns.index(p.paper_hash), 1.0)).cache()

    # Create a CoordinateMatrix from an RDD of matrix entries.
    mat = CoordinateMatrix(rawData)
    rowMat = mat.toRowMatrix()
    print(mat.numRows())  # 3
    print(rowMat.numCols())

    # A row-by-row transpose was attempted here; see the sketch below for a
    # simpler way to transpose a CoordinateMatrix by swapping entry coordinates.
    for r in rowMat.rows.collect():
        print(r)
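# The commented-out transpose above can be avoided entirely: a CoordinateMatrix
# is transposed by swapping the row and column index of every entry. This is a
# minimal sketch, not part of the original snippet; it works for any
# CoordinateMatrix, such as the `mat` built inside process() above. Recent
# Spark releases also provide mat.transpose() directly.
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

def transpose_coordinate_matrix(mat):
    """Return the transpose of `mat` by swapping (i, j) on each MatrixEntry."""
    swapped = mat.entries.map(lambda e: MatrixEntry(e.j, e.i, e.value))
    return CoordinateMatrix(swapped, numRows=mat.numCols(), numCols=mat.numRows())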
from pyspark.mllib.linalg.distributed import CoordinateMatrix

def process(sparkContext, sqlContext):
    print("Building Graph...")
    G_apa = buildGraphAPA()
    print("Meta Path...")
    paths = metaPathAPA(G_apa)
    print("Training...")

    authorIndex = []
    authorDegree = []
    authors = paths[0]
    pathNumber = paths[2]

    # Serialize the meta-path counts as "key:count" strings so they can be parallelized.
    pathNumberArray = []
    for pn in pathNumber.keys():
        pathNumberArray.append(str(pn) + ":" + str(pathNumber.get(pn)))

    # Assign every author an index and record its degree in the graph.
    index = 0
    for author in authors:
        authorDegree.append(str(author) + ":" + str(len(G_apa[author])))
        authorIndex.append(str(author) + ":" + str(index))
        index = index + 1

    # unique_authors = authors
    authorsRDD = sparkContext.parallelize(authors)
    authorIndex = sparkContext.parallelize(authorIndex)
    pathNumber = sparkContext.parallelize(pathNumberArray)
    authorDegree = sparkContext.parallelize(authorDegree)

    authors = authorsRDD.collect()

    # Rebuild the lookup dictionaries on the driver.
    ai = authorIndex.collect()
    authorIndex = dict()
    for a in ai:
        p = a.split(":")
        authorIndex[p[0]] = p[1]

    ad = authorDegree.collect()
    authorDegree = dict()
    for a in ad:
        p = a.split(":")
        authorDegree[p[0]] = p[1]

    pn = pathNumber.collect()
    pathNumber = dict()
    for a in pn:
        p = a.split(":")
        pathNumber[p[0]] = p[1]

    def matEntry(author, authors):
        """Build the row of similarity entries for a single author."""
        row = []
        for a in authors:
            if author == a:
                row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 1.0))
            else:
                key = str(author) + str(a)
                if key in pathNumber:
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])),
                                2.0 * float(pathNumber.get(key)) / (float(authorDegree[author]) + float(authorDegree[a]))))
                else:
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 0.0))
        return row

    def matEntryNoArgs():
        """Driver-side variant that builds all entries in a nested loop."""
        row = []
        for author in authors:
            for a in authors:
                if author == a:
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 1.0))
                else:
                    key = str(author) + str(a)
                    if key in pathNumber:
                        row.append((int(float(authorIndex[author])), int(float(authorIndex[a])),
                                    2.0 * float(pathNumber.get(key)) / (float(authorDegree[author]) + float(authorDegree[a]))))
                    else:
                        row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 0.0))
        return row

    print("Computing matrix entries...")
    # flatMap so every author contributes one (row, col, value) entry per author pair.
    me = authorsRDD.flatMap(lambda author: matEntry(author, authors)).collect()
    # me = matEntry()
    # me = matEntryNoArgs()
    print("Parallelizing entries...")
    entries = sparkContext.parallelize(me)

    # Create a CoordinateMatrix from an RDD of matrix entries.
    mat = CoordinateMatrix(entries)
    # mat.saveAsTextFile("/home/xuepeng/uts/metapath.txt")

    # Get its size.
    print(mat.numRows())  # 3
    print(mat.numCols())  # 2
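# CoordinateMatrix has no saveAsTextFile() of its own (the commented-out call
# above would fail), but its `entries` RDD does. A minimal sketch, not part of
# the original snippet, assuming `mat` and `sparkContext` as in process() above;
# the output path is only illustrative.
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

def save_and_reload_entries(sparkContext, mat, path):
    """Persist the (i, j, value) entries as text and rebuild the matrix later."""
    mat.entries.map(lambda e: "%d %d %f" % (e.i, e.j, e.value)).saveAsTextFile(path)
    reloaded = sparkContext.textFile(path).map(
        lambda line: MatrixEntry(*[float(x) for x in line.split(" ")]))
    return CoordinateMatrix(reloaded)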
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix

spark = SparkSession.builder.appName('imbalanced_binary_classification').getOrCreate()
sc = spark.sparkContext  # SparkContext for the RDD-based API below

#new_df = spark.read.option("delimiter", " ").csv('data/1138_bus/1138_bus_no_head.mtx', header=False, inferSchema=True)
#new_df.printSchema()

# Read the MatrixMarket body (header already stripped) as whitespace-separated triples.
rdd = sc.textFile('data/1138_bus/1138_bus_no_head.mtx')
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(lambda line: [float(x) for x in line])
print(rdd.take(2))

#ncol = len(rdd.map(lambda r: r.image).first())
nrows = rdd.count()
ncols = 3
#matrix = Matrices.dense(nrows, ncols, rdd)
print("ncol: %d, nrow %d" % (ncols, nrows))

# Each (row, col, value) triple becomes one entry of the CoordinateMatrix.
coord_mat = CoordinateMatrix(rdd.map(tuple))
print("num rows in matrix %d" % coord_mat.numRows())
print("finished using pyspark")

# ------------------------------------------------
print("now use SparkSession")
spark = SparkSession.builder.getOrCreate()
df_2 = spark.read.option("delimiter", " ").csv('./data/lpi_ceria3d_b.mtx', header=False, inferSchema=True)
df_2.printSchema()
#coord_mat_2 = CoordinateMatrix(df_2.rdd.map(tuple))
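# The commented-out conversion above can be completed by turning each DataFrame
# row into a MatrixEntry. A minimal sketch, not part of the original snippet,
# assuming df_2 holds numeric (row, col, value) columns as read from the .mtx body:
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

coord_mat_2 = CoordinateMatrix(
    df_2.rdd.map(lambda row: MatrixEntry(row[0], row[1], row[2])))
print("num rows in matrix %d" % coord_mat_2.numRows())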
import pyspark
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from operator import add
from pyspark.sql import SparkSession

sc = SparkContext()

# Lines tagged "A" hold the (row, col, value) triples; strip the tag and parse ints.
r = sc.textFile("part-00000")
m = (r.flatMap(lambda x: x.split('\n'))
      .filter(lambda x: "A" in x)
      .map(lambda x: x.strip("A, ").split(' '))
      .map(lambda x: tuple(map(int, x))))
# n = m.map(lambda x: MatrixEntry(*x))

spark = SparkSession(sc)
# m.toDF().show()
print(hasattr(m, "toDF"))

cmat = CoordinateMatrix(m)
# mat = CoordinateMatrix(n)
# o = mat.entries.take(5)
print(cmat.numRows())  # 3
print(cmat.numCols())

rowmat = cmat.toRowMatrix()
print(rowmat.numRows())  # 3
print(rowmat.numCols())
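# The snippet above only checks for toDF(); to actually inspect the matrix as a
# DataFrame, convert each MatrixEntry back to a plain tuple first. A minimal
# sketch, not part of the original snippet, assuming `cmat` and the active
# SparkSession created above:
entries_df = cmat.entries.map(lambda e: (e.i, e.j, e.value)).toDF(["row", "col", "value"])
entries_df.show()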
from pyspark.mllib.linalg.distributed import CoordinateMatrix

def multiply_coordinate_matrices(left: CoordinateMatrix, right: CoordinateMatrix):
    """Multiply 2 spark CoordinateMatrices without converting either of them
    into a DenseMatrix.

    NOTE: spark does not provide distributed matrix multiplication of sparse
    matrices, so a custom approach has to be used, which is discussed here
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703
    """

    def key_by_col(x):
        """Take a MatrixEntry of (row, col, val) and return a 2-tuple of (col, (row, val))"""
        return (x.j, (x.i, x.value))

    def key_by_row(x):
        """Take a MatrixEntry of (row, col, val) and return a 2-tuple of (row, (col, val))"""
        return (x.i, (x.j, x.value))

    left_by_col = left.entries.map(lambda x: key_by_col(x))
    right_by_row = right.entries.map(lambda x: key_by_row(x))

    # Next we perform a row by col matrix multiplication
    # where a shared "key" is used to group entries of the left matrix
    # with COLUMN j and entries of the right matrix with ROW j.
    # Note that entries with the same j will stick together.
    # This should be obvious if you recall that matrix multiplication
    # matches the index of the left column with the index of the right row.
    col_by_row = left_by_col.join(right_by_row)

    def row_by_col_multiplication(x):
        """The input is a key-pair tuple in the following format:
            (key, ((left_row, left_val), (right_col, right_val)))

        The output is a tuple in the following format:
            ((left_row, right_col), left_val * right_val)

        Note that having finished the grouping we no longer need the shared key
        (i.e. we no longer need the original indices of the left_col or right_row).
        This is because the summed values will go into the output matrix at the
        location (left_row, right_col), so we can regroup by these indices and sum.
        """
        return ((x[1][0][0], x[1][1][0]), (x[1][0][1] * x[1][1][1]))

    # Multiply each pair of a left-matrix column entry and a right-matrix row entry.
    products = col_by_row.map(lambda x: row_by_col_multiplication(x))

    # Sum up all the products for a given (left_row, right_col) pair.
    summed = products.reduceByKey(lambda accum, n: accum + n)

    # Unnest the keys so we can convert back to a coordinate matrix.
    flattened = summed.map(lambda x: (x[0][0], x[0][1], x[1]))

    res = CoordinateMatrix(flattened)
    log.info(
        "finished creating coord matrix from dot product",
        rows=res.numRows(),
        cols=res.numCols(),
    )
    return res
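# A minimal usage sketch, not part of the original code: multiply two small
# CoordinateMatrices and inspect the product. Assumes an active SparkContext
# `sc` and the module-level `log` that multiply_coordinate_matrices relies on.
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

left = CoordinateMatrix(sc.parallelize([
    MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 1, 3.0)]))
right = CoordinateMatrix(sc.parallelize([
    MatrixEntry(0, 0, 4.0), MatrixEntry(1, 0, 5.0)]))

product = multiply_coordinate_matrices(left, right)
# Entries of the 2x1 product: (0, 0, 1*4 + 2*5) and (1, 0, 3*5).
print(sorted((e.i, e.j, e.value) for e in product.entries.collect()))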
from pyspark import StorageLevel
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.sql import DataFrame, SQLContext
from pyspark.sql import functions as F

def sparse_dot_product_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_vector_col: str,
    primary_df: DataFrame,
    secondary_row_number_col: str,
    secondary_vector_col: str,
    secondary_df: DataFrame,
):
    """Calculate the dot product for every pair of items between a column of
    SparseVectors in the primary dataframe and a column of SparseVectors in the
    secondary dataframe.

    The input dataframes must have a row number attached. This will correspond
    to the row number in the resulting row matrix. It does not matter if the
    row numbers are sequential as long as they are unique within their
    dataframes respectively.

    NOTE: if you are using this function in order to generate cosine similarity
    scores then remember to normalize your input vectors first. This way the
    resulting coordinate matrix will represent the similarity scores."""

    def primary_row_to_coords(row):
        """Convert a sparse vector to a list of coords in the format of
        (row_num, col_num, value)"""
        row_num = row.__getitem__(primary_row_number_col)
        vec = row.__getitem__(primary_vector_col)
        return [(row_num, i, j) for i, j in zip(vec.indices, vec.values)]

    primary_rdd = primary_df.select(
        F.col(primary_row_number_col), F.col(primary_vector_col)
    ).rdd.flatMap(lambda row: primary_row_to_coords(row))

    if primary_rdd.isEmpty():
        raise ValueError("Primary RDD is empty. Cannot perform matrix multiplication")

    primary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    def secondary_row_to_coords(row):
        """Convert a sparse vector to a list of coords in the format of
        (row_num, col_num, value)"""
        row_num = row.__getitem__(secondary_row_number_col)
        vec = row.__getitem__(secondary_vector_col)
        # IMPORTANT - note that we are actually creating
        # the transpose of the secondary matrix hence
        # why the coordinates are back to front
        return [(i, row_num, j) for i, j in zip(vec.indices, vec.values)]

    secondary_rdd = secondary_df.select(
        F.col(secondary_row_number_col), F.col(secondary_vector_col)
    ).rdd.flatMap(lambda row: secondary_row_to_coords(row))

    secondary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    if secondary_rdd.isEmpty():
        raise ValueError("Secondary RDD is empty. Cannot perform matrix multiplication")

    # Create the primary coordinate matrix from the coords.
    primary_matrix = CoordinateMatrix(primary_rdd)
    log.info(
        "finished creating primary coordinate matrix",
        rows=primary_matrix.numRows(),
        cols=primary_matrix.numCols(),
    )

    # Create the secondary coordinate matrix (already transposed) from the coords.
    secondary_matrix = CoordinateMatrix(secondary_rdd)
    log.info(
        "finished creating secondary coordinate matrix transpose",
        rows=secondary_matrix.numRows(),
        cols=secondary_matrix.numCols(),
    )

    coords_matrix = multiply_coordinate_matrices(primary_matrix, secondary_matrix)

    res = coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )

    primary_rdd.unpersist()
    secondary_rdd.unpersist()

    return res
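# A minimal usage sketch, not part of the original code: compute cosine
# similarities between two small sets of already-normalized SparseVectors.
# Assumes an active SparkSession `spark`, plus the `log` and
# `coord_matrix_to_dataframe` helpers that sparse_dot_product_cross_join
# already relies on; the column names here are only illustrative.
from pyspark.ml.linalg import Vectors

primary_df = spark.createDataFrame(
    [(0, Vectors.sparse(3, [0, 1], [0.6, 0.8])),
     (1, Vectors.sparse(3, [2], [1.0]))],
    ["row_num", "features"],
)
secondary_df = spark.createDataFrame(
    [(0, Vectors.sparse(3, [0], [1.0])),
     (1, Vectors.sparse(3, [1, 2], [0.6, 0.8]))],
    ["row_num", "features"],
)

similarities = sparse_dot_product_cross_join(
    spark, "similarity",
    "row_num", "features", primary_df,
    "row_num", "features", secondary_df,
)
similarities.show()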