def multiply_matrices2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()

    listB = B.tolist()
    rddB = sc.parallelize([IndexedRow(i, listB[i]) for i in range(len(listB))])
    matB = IndexedRowMatrix(rddB).toBlockMatrix()

    matC = matA.multiply(matB).toLocalMatrix()
    return matC.toArray()
def multiply_transpose2(A: np.ndarray) -> np.ndarray:  # A * A.T
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)

    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()
    matT = matA.transpose()
    matR = matA.multiply(matT)
    res = matR.toLocalMatrix().toArray()

    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:", elapsed_time)
    counter = counter + 1
    return res
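# A minimal sanity check of the two helpers above against NumPy on small inputs.
# This is a sketch: it assumes the SparkContext `sc`, the `time` module, and the
# global `counter` referenced by the functions are already set up elsewhere in
# this script.
import numpy as np

A = np.arange(6, dtype=float).reshape(2, 3)   # 2 x 3
B = np.arange(12, dtype=float).reshape(3, 4)  # 3 x 4

assert np.allclose(multiply_matrices2(A, B), A @ B)   # distributed A*B vs. local
assert np.allclose(multiply_transpose2(A), A @ A.T)   # distributed A*A^T vs. local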
def dense_matrix_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_matrix: IndexedRowMatrix,
    secondary_row_number_col: str,
    secondary_matrix: DenseMatrix,
):
    """Multiply two dense matrices to produce a dataframe of pairwise results:
    the primary row number, the secondary column number, and the dot product as a score.

    Note that if you are using this method to produce the cosine similarity of two
    dense matrices, it is expected that you have already taken the transpose of the
    secondary matrix."""
    product = primary_matrix.multiply(secondary_matrix)
    log.info(
        "finished dense matrix multiplication",
        num_cols=product.numCols(),
        num_rows=product.numRows(),
    )
    coords_matrix = product.toCoordinateMatrix()
    log.info(
        "finished converting row matrix to coordinate matrix",
        num_cols=coords_matrix.numCols(),
        num_rows=coords_matrix.numRows(),
    )
    return coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )
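# A hedged usage sketch for the function above, e.g. for cosine similarity.
# It assumes `sc`, `spark`, `log`, and the `coord_matrix_to_dataframe` helper
# used above are already defined, and that both sides hold unit-normalised rows.
from pyspark.mllib.linalg import DenseMatrix, Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# Primary side: a distributed matrix with one (unit-normalised) row per item.
primary = IndexedRowMatrix(sc.parallelize([
    IndexedRow(0, Vectors.dense([1.0, 0.0])),
    IndexedRow(1, Vectors.dense([0.0, 1.0])),
]))

# Secondary side: a small local matrix, already transposed so that each column
# holds one secondary row (DenseMatrix values are column-major).
secondary_transposed = DenseMatrix(2, 2, [1.0, 0.0, 0.0, 1.0])

scores = dense_matrix_cross_join(
    spark, "score", "primary_row", primary, "secondary_row", secondary_transposed)
scores.show()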
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: spark-submit generate_similarity_matrix.py <input path to hdfs file> <hdfs output path>",
            file=sys.stderr)
        exit(-1)

    # Convert and process raw input to (book_id, [features])
    def processFeatures(raw):
        features_str = raw.split()
        book_id = int(features_str[0])
        features = []
        for i in range(1, len(features_str)):
            features.append(float(features_str[i]))
        return (book_id, features)

    sc = SparkContext(appName="BookRecSystem")
    spark = SQLContext(sc)

    featureRdd = sc.textFile(sys.argv[1])
    featureRdd = featureRdd.map(processFeatures)

    labels = featureRdd.map(lambda x: x[0])  # label_rdd
    fvecs = featureRdd.map(lambda x: Vectors.dense(x[1]))  # feature_rdd
    data = labels.zip(fvecs)

    # Convert to a block matrix for pairwise cosine similarity
    mat = IndexedRowMatrix(data).toBlockMatrix()

    # Pairwise cosine similarity written out as "index sim1 sim2 ..." lines
    dot = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows.map(
        lambda x: (x.index, x.vector.toArray())).sortByKey().map(
            lambda x: str(x[0]) + ' ' + ' '.join(map(str, x[1])))

    dot.saveAsTextFile(sys.argv[2])  # save output
    sc.stop()
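# Note: the block-matrix product above yields true cosine similarities only if
# the feature vectors are unit length. A minimal sketch of a normalisation step
# (an assumption: the raw features are not already normalised); it would be
# applied to featureRdd right after processFeatures, before labels/fvecs are built.
import numpy as np

def normalizeFeatures(record):
    book_id, features = record
    vec = np.asarray(features, dtype=float)
    norm = np.linalg.norm(vec)
    return (book_id, (vec / norm).tolist() if norm > 0.0 else features)

# featureRdd = featureRdd.map(normalizeFeatures)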
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import numpy as np
import os
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix

os.environ["SPARK_HOME"] = "C:\\Users\\plfoley\\spark-2.3.1-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:\\Users\\plfoley\\winutils"

sc = SparkContext()

rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) \
    .zipWithIndex()
rows2 = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) \
    .zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from an RDD
sqlContext = SQLContext(sc)

rows = IndexedRowMatrix(
    rows.map(lambda row: IndexedRow(row[1], row[0]))
).toBlockMatrix()
rows2 = IndexedRowMatrix(
    rows2.map(lambda row2: IndexedRow(row2[1], row2[0]))
).toBlockMatrix()

mat_product = rows.multiply(rows2).toLocalMatrix()
print(mat_product)
wordsData = tokenizer.transform(dataFrame)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()

# Normalisation and transformation of the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity process using the norm and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")
        .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
        psf.col("j.num").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .sort("i", "j")\
    .show()

tempcosine = data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
        psf.col("j.num").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"))
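# Instead of collecting the full similarity matrix to the driver with
# toLocalMatrix(), the block-matrix product can also be read back row by row.
# A sketch, assuming an active SparkSession so that toDF() works:
sim_rows = dot.toIndexedRowMatrix().rows \
    .map(lambda r: (int(r.index), r.vector.toArray().tolist()))
sim_df = sim_rows.toDF(["num", "similarities"])
sim_df.show(5)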
def sparkComputeCost(self, input_file, x, y, theta):
    sc = SparkContext()

    # add the ones vector while building the RDD
    idx = 0
    x_mat = sc.textFile(input_file) \
        .map(lambda line: ('1, ' + line).split(",")[:-1]) \
        .zipWithIndex()

    # need a SQLContext() to generate an IndexedRowMatrix from an RDD
    sqlContext = SQLContext(sc)

    x_mat = IndexedRowMatrix(
        x_mat.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    x_mat.cache()
    print("Matrix rows x cols")
    print(x_mat.numRows())
    print(x_mat.numCols())

    vec = sc.parallelize(theta) \
        .map(lambda line: [line]) \
        .zipWithIndex()
    vec = IndexedRowMatrix(
        vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    vec.cache()
    print("Vector rows x cols")
    print(vec.numRows())
    print(vec.numCols())

    h = x_mat.multiply(vec)
    h.cache()
    print("Hypothesis rows x cols")
    print(h.numRows())
    print(h.numCols())

    y_vec = sc.textFile(input_file) \
        .map(lambda line: [('1, ' + line).split(",")[-1]]) \
        .zipWithIndex()
    y_vec = IndexedRowMatrix(
        y_vec.map(lambda row: IndexedRow(row[1], row[0]))
    ).toBlockMatrix()
    y_vec.cache()

    errors = h.subtract(y_vec).toLocalMatrix()
    print(sum(errors.toArray()))

    '''sparkSession = SparkSession \
        .builder \
        .appName('pyspark') \
        .getOrCreate()

    df = sparkSession.read.csv(input_file)
    df = df \
        .toDF(x, y) \
        .withColumn("Ones", psf.lit(1)) \
        .cache()
    df.select(x, 'Ones').show()'''

    '''sc = SparkContext('local', 'pyspark')
import os

os.environ["SPARK_HOME"] = "C:\\spark"
os.environ["HADOOP_HOME"] = "C:\\winutils"

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)

    # IndexedRowMatrix
    # Method 1:
    m = sc.parallelize([[2, 2, 2], [3, 3, 3]]).zipWithIndex()
    n = sc.parallelize([[1, 1], [4, 4], [3, 3]]).zipWithIndex()

    # Create an IndexedRowMatrix, then convert to a BlockMatrix
    mat1 = IndexedRowMatrix(m.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()
    mat2 = IndexedRowMatrix(n.map(lambda row2: IndexedRow(row2[1], row2[0]))).toBlockMatrix()

    # Method 2:
    # mat1 = BlockMatrix(m, 2, 3)
    # mat2 = BlockMatrix(n, 3, 2)

    # Use the multiply function from pyspark.mllib.linalg
    mat_mul_output = mat1.multiply(mat2).toLocalMatrix()
    print(mat_mul_output)
data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda _: np.array(_.strip().split()).astype(float))
data = data.map(lambda _: _ / np.linalg.norm(_))

U = data.zipWithIndex().map(lambda _: IndexedRow(_[1], _[0]))
U = IndexedRowMatrix(U)

UT = U.toCoordinateMatrix()
UT = UT.transpose()

U = U.toBlockMatrix()
UT = UT.toBlockMatrix()

S = U.multiply(UT)
S_coord = S.toCoordinateMatrix()
sim = S_coord.entries
print(sim.take(100))
debug.TIMESTAMP(2)
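# Optionally, the MatrixEntry results can be turned into a DataFrame of
# (i, j, similarity) triples. A sketch that assumes a SparkSession/SQLContext
# is already active so that toDF() works:
sim_df = sim.map(lambda e: (int(e.i), int(e.j), float(e.value))) \
    .toDF(["i", "j", "similarity"])
sim_df.show(10)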
# A SQLContext is needed so that toDF() works on RDDs
sql.SQLContext(sc)

data = sc.textFile(dataset)
# data = (data.map(lambda s: (list(map(lambda x: float(x), s.split()))))).zipWithIndex().map(lambda x: ((x[1], 0), DenseMatrix(1, 1000, x[0])))

# Read the matrix normally into the format (row number, vector)
data = data.map(lambda s: (list(map(lambda x: float(x), s.split())))
                ).zipWithIndex().map(lambda x: (x[1], x[0]))

# Create the transpose of the matrix
tdata = sc.textFile(dataset).map(lambda s: list(
    map(lambda x: (x[0], float(x[1])), enumerate(s.split())))).zipWithIndex().flatMap(
        lambda x: map(lambda y: (y[0], (x[1], y[1])), x[0])).groupByKey()

# Map the transposed data to the same format as the normal matrix
tdata = tdata.map(lambda x: (x[0], list(
    map(lambda s: s[1], sorted(list(x[1]), key=itemgetter(0))))))

# Create a BlockMatrix for the normal matrix and its transpose
mat = IndexedRowMatrix(data)
mat = mat.toBlockMatrix()
matTranspose = IndexedRowMatrix(tdata).toBlockMatrix()

# Get the final result by multiplying mat * mat^T * mat
matTranspose = mat.multiply(matTranspose)
matRes = matTranspose.multiply(mat)

print('Done')
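# The multiplications above only build up RDD transformations; the product is
# not fully materialised until an action runs. A sketch that forces the result
# and pulls it back to the driver -- only sensible when the matrix fits in
# driver memory:
local_res = matRes.toLocalMatrix()
print(local_res.numRows, local_res.numCols)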
# Use the rule (A*AT)*A = A*(AT*A).

# Calculate A transpose
AT = A.transpose()

# Make the first multiplication AT*A
ATA = AT.multiply(A)

# Print step 3 ready. With the full data set this takes ~9 minutes.
print(" ")
print("Step 3 ready")
print("ATA rows and cols:")
print("Rows:", ATA.numRows(), "Cols:", ATA.numCols())
print(" ")

# Make the second multiplication A*ATA
AATA = A.multiply(ATA)

# Print step 4 ready
print(" ")
print("Step 4 ready")
print(" ")

# Convert AATA to an IndexedRowMatrix
AATA = AATA.toIndexedRowMatrix()

# Get the first row from AATA
first_row = AATA.rows.filter(lambda x: x.index == 0).first().vector.toArray()

# Print the first row
print(first_row)
def calculate_distance(self, sdf1, sdf2):
    """
    Calculate the distance between the vector-type columns of two Spark dataframes.

    :param sdf1: expected to have columns id1 (dtype int) and v1 (dtype Vector)
    :param sdf2: expected to have columns id2 (dtype int) and v2 (dtype Vector)
    :return: a dataframe with one row per (id1, id2) pair and the distance in `diff`
    """
    cov = RowMatrix(
        sdf1.select(["v1"]).withColumnRenamed("v1", "v").union(
            sdf2.select(["v2"]).withColumnRenamed("v2", "v")
        ).rdd.map(lambda row: Vectors.fromML(row.asDict()["v"]))
    ).computeCovariance().toArray()

    x, v = np.linalg.eigh(cov)
    indices = 1e-10 <= x

    # we are trying to enforce the data types to be only python types
    n = int(v.shape[0])
    m = int(indices.sum())
    v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]
    v_spark = DenseMatrix(n, m, v_vals)
    x_vals = [
        float(val) for val in np.diag(x[indices]**-0.5).reshape(-1, ).tolist()
    ]
    x_spark = DenseMatrix(m, m, x_vals)

    # we get the index to maintain the order
    _sdf1 = sdf1.rdd.zipWithIndex()\
        .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1, index=val_key[1])).toDF()
    _sdf1.persist()
    _sdf2 = sdf2.rdd.zipWithIndex()\
        .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2, index=val_key[1])).toDF()
    _sdf2.persist()

    # we get our indexed row matrices
    _sdf1_mat = IndexedRowMatrix(
        _sdf1.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                             vector=Vectors.fromML(row.asDict()["v1"]))))
    _sdf2_mat = IndexedRowMatrix(
        _sdf2.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                             vector=Vectors.fromML(row.asDict()["v2"]))))

    # we apply our transformation and then set it as our new variable
    _sdf1 = _sdf1.drop("v1").join(
        _sdf1_mat.multiply(v_spark).multiply(x_spark).rows
        .map(lambda indexed_row: Row(index=indexed_row.index, v1=indexed_row.vector)).toDF(),
        "index")
    _sdf2 = _sdf2.drop("v2").join(
        _sdf2_mat.multiply(v_spark).multiply(x_spark).rows
        .map(lambda indexed_row: Row(index=indexed_row.index, v2=indexed_row.vector)).toDF(),
        "index")

    @F.udf(DoubleType())
    def tmp(vec):
        return float(vec[0].squared_distance(vec[1]))**0.5

    all_sdf = _sdf1.crossJoin(_sdf2)
    dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))
    dist_sdf.persist()
    return dist_sdf
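# A small usage sketch for calculate_distance. The names here are assumptions:
# an active SparkSession `spark` and an instance `calc` of the class this
# method belongs to.
from pyspark.ml.linalg import Vectors as MLVectors

sdf1 = spark.createDataFrame(
    [(1, MLVectors.dense([0.0, 1.0])), (2, MLVectors.dense([1.0, 0.0]))],
    ["id1", "v1"])
sdf2 = spark.createDataFrame(
    [(10, MLVectors.dense([0.5, 0.5])), (11, MLVectors.dense([1.0, 1.0]))],
    ["id2", "v2"])

calc.calculate_distance(sdf1, sdf2).select("id1", "id2", "diff").show()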
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix
from pyspark.sql import SQLContext
from pyspark import SparkContext

sc = SparkContext()
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from an RDD
sqlContext = SQLContext(sc)

block_matrix = IndexedRowMatrix(
    rows.map(lambda row: IndexedRow(row[1], row[0]))
).toBlockMatrix()

mat_product = block_matrix.multiply(block_matrix)
result = mat_product.toLocalMatrix()
print("Matrix Product \n", result)

mat_sum = block_matrix.add(block_matrix)
result = mat_sum.toLocalMatrix()
print("Matrix Sum \n", result)

mat_transpose = block_matrix.transpose()
result = mat_transpose.toLocalMatrix()
print("Matrix Transpose \n", result)
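# A quick driver-side check of the three results against NumPy (an assumption:
# the matrix is small enough that collecting it to the driver is fine):
import numpy as np

local = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=float)
assert np.allclose(mat_product.toLocalMatrix().toArray(), local @ local)
assert np.allclose(mat_sum.toLocalMatrix().toArray(), local + local)
assert np.allclose(mat_transpose.toLocalMatrix().toArray(), local.T)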