def __init__(self, args, sc):
    self.EPSILON = 1.0e-5
    self.ctx = sc
    self.numPartitions = args.partitions
    self.numIterations = args.iterations
    self.inputVectorPath = args.inputVector
    self.inputMatrixPath = args.inputMatrix
    self.outputVectorPath = args.outputVector

    # Read Matrix input data
    # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)
    if (self.numPartitions != 0):
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))\
            .repartition(self.numPartitions)
    else:
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))

    self.inputMatrix = IndexedRowMatrix(inputMatrixData)
    self.inputVector = readVector(self.inputVectorPath, self.ctx)
    if (self.numIterations == 0):
        self.numIterations = self.inputVector.size * 2
    self.result = Vectors.zeros(self.inputVector.size)
def test_row_matrix_invalid_type(self):
    rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
    invalid_type = ""
    matrix = RowMatrix(rows)
    self.assertRaises(TypeError, matrix.multiply, invalid_type)

    irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]),
                                 IndexedRow(1, [4, 5, 6])])
    imatrix = IndexedRowMatrix(irows)
    self.assertRaises(TypeError, imatrix.multiply, invalid_type)
def multiply_matrices2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()

    listB = B.tolist()
    rddB = sc.parallelize([IndexedRow(i, listB[i]) for i in range(len(listB))])
    matB = IndexedRowMatrix(rddB).toBlockMatrix()

    matC = matA.multiply(matB).toLocalMatrix()
    return matC.toArray()
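# A minimal usage sketch (not from the original source), assuming an active
# SparkContext named `sc` (which multiply_matrices2 itself relies on) and
# `import numpy as np`. The distributed product should match np.dot(A, B).
A = np.array([[1.0, 2.0], [3.0, 4.0]])
B = np.array([[5.0, 6.0], [7.0, 8.0]])
C = multiply_matrices2(A, B)
print(np.allclose(C, A.dot(B)))  # expected: True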
def readMovieChar(spark, f_name):
    my_data = list()
    with open(f_name, 'r') as handle:
        reader = csv.reader(handle, delimiter=",", quotechar='"')
        for row in reader:
            my_data.append(row)
    my_data.pop(0)  # drop the header row

    matrix = np.zeros(shape=(int(my_data[-1][0]) + 1, len(movie_genre)), dtype=int)
    movie_list = dict()
    for movie in my_data:
        movie_id = int(movie[0])
        movie_list[movie_id] = movie[1]
        genres = movie[2].split('|')
        for each in genres:
            col_idx = movie_genre.get(each, movie_genre['Other'])
            matrix[movie_id][col_idx] = 1

    indexedRows = spark.sparkContext.parallelize(
        [IndexedRow(i, matrix[i]) for i in range(len(matrix))])
    mat = IndexedRowMatrix(indexedRows)
    return mat, movie_list
def MatrixTranspose(mat):
    # Known issues:
    # 1. Fails for some data; the cause is unclear, but reducing the number of rows can help.
    # 2. The transpose sometimes returns a wrong result, apparently due to a partitioning
    #    issue -- repartition(1) sometimes fixes it. PySpark also changes the order of rows
    #    when a transposed coordinate matrix is converted to a row matrix.
    #    Bug ref: https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an indexed matrix and reordering partially works around this, but it is awkward.
    '''
    transpose a row matrix -- to save space/memory, use a sparse vector when the input is a sparse vector
    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        # this turns everything into dense matrix entries; avoid this path for efficiency
        mat = mat.toRowMatrix()
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF().orderBy("index")
    # back to sparse first, then convert to an IndexedRowMatrix
    transposed_mat = transposed_mat.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0],
            row["vector"].values[np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(transposed_mat)
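# Hypothetical usage sketch for MatrixTranspose (not from the original source),
# assuming an active SparkSession (so rdd.toDF() inside the function works), a
# SparkContext `sc`, and the imports the function relies on: IndexedRow,
# IndexedRowMatrix, CoordinateMatrix, MatrixEntry, MLLibVectors, numpy as np.
rows = sc.parallelize([IndexedRow(0, MLLibVectors.sparse(3, [0], [1.0])),
                       IndexedRow(1, MLLibVectors.sparse(3, [2], [2.0]))])
transposed = MatrixTranspose(IndexedRowMatrix(rows))  # 2 x 3 input -> 3 x 2 output
print(transposed.rows.collect())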
def compute_similarity(df):
    """
    Compute cosine similarities between movie rating columns.
    :param df: dataframe of ratings by user for movies
    :return:
    """
    # df = df.filter(df.movieId.isin([91542.0, 1.0, 5.0, 90.0, 2541.0, 1246.0, 1552.0, 4084.0, 5679.0]))
    df = df.groupBy("userId").pivot("movieId").agg(
        first(col('rating')).cast("double"))
    mat = IndexedRowMatrix(
        df.rdd.map(lambda row: IndexedRow(row[0], Vectors.dense(row[1:]))))
    cs = mat.columnSimilarities()
    path = "test"
    cs.entries.toDF().write.parquet(path)
    cs.entries.toDF().coalesce(1)\
        .write.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .save("testtest.csv")
def vectorDFtoIndexedMatrix(df, vecvar, idcol):
    '''
    applicable to a dataframe that already has assembled (sparse) vectors
    '''
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row[vecvar].size, row[vecvar].indices, row[vecvar].values)))
    return IndexedRowMatrix(df)
def __init__(self, args, sc):
    self.ctx = sc
    self.numPartitions = args.partitions
    self.inputVectorPath = args.inputVector
    self.inputMatrixPath = args.inputMatrix
    self.outputVectorPath = args.outputVector
    self.alpha = args.alpha
    self.beta = args.beta

    # Read Matrix input data
    # inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)
    if (self.numPartitions != 0):
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))\
            .repartition(self.numPartitions)
    else:
        inputMatrixData = readMatrixRowPerLine(self.inputMatrixPath, self.ctx)\
            .map(lambda line: IndexedRow(line[0], line[1]))

    print("Number of rows in Matrix with type " + str(type(inputMatrixData)) + " is: " + str(inputMatrixData.count()))

    # PipelinedRDD to RDD
    # newData = sc.parallelize(inputMatrixData.collect())
    inputMatrix = IndexedRowMatrix(inputMatrixData)
    inputVector = readVector(self.inputVectorPath, self.ctx)
    print("Vector size is: " + str(inputVector.size))

    result = Vectors.zeros(inputVector.size)
    # print(result)

    # DGEMV(alpha, A, x, beta, y, jsc):
    result = L2.DGEMV(self.alpha, inputMatrix, inputVector, self.beta, result, self.ctx)

    # writeVector(self.outputVectorPath, result)
    printVector(result)
def df_to_indexed_row_matrix(row_number_col: str, vector_col: str, df: DataFrame):
    """Convert a dataframe containing a row number and a vector column to an IndexedRowMatrix."""
    indexed_rows = (df.where(F.col(vector_col).isNotNull())
                    .select(F.col(row_number_col), F.col(vector_col))
                    .rdd.map(lambda row: IndexedRow(row[row_number_col],
                                                    row[vector_col].toArray())))
    if indexed_rows.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")
    return IndexedRowMatrix(indexed_rows)
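# Hypothetical usage sketch (not from the original source), assuming an active
# SparkSession named `spark` and `from pyspark.ml.linalg import Vectors as MLVectors`;
# the column names "row_number" and "features" are made up for the example.
df = spark.createDataFrame(
    [(0, MLVectors.dense([1.0, 2.0])), (1, MLVectors.dense([3.0, 4.0]))],
    ["row_number", "features"])
mat = df_to_indexed_row_matrix("row_number", "features", df)
print(mat.numRows(), mat.numCols())  # expected: 2 2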
def _dist_matrix(self, rddv1, rddv2, sc):
    dlist1 = rddv1.collect()
    dlist2 = rddv2.collect()
    irows1 = [IndexedRow(i, dlist1[i][0].toArray()) for i in range(0, len(dlist1))]
    irows2 = [IndexedRow(i, dlist2[i][0].toArray()) for i in range(0, len(dlist2))]
    IMatrix1 = IndexedRowMatrix(sc.parallelize(irows1))
    IMatrix2 = IndexedRowMatrix(sc.parallelize(irows2))
    cart = IMatrix1.rows.cartesian(IMatrix2.rows)
    # Euclidean distance between every pair of rows
    A = cart.map(lambda x: (x[0].index, x[1].index,
                            np.sqrt(np.sum(np.power(
                                np.array(x[0].vector) - np.array(x[1].vector), 2))))).collect()
    A.sort()
    Arr = self.__dist_array(A)
    return Arr
def DFtoIndexedMatrix(df, quantvars, idcol):
    '''
    convert a numeric dataframe to an indexed row matrix with sparse vectors as basic units;
    not applicable to a dataframe that already has assembled vectors
    '''
    df = VectorAssembler(
        inputCols=quantvars, outputCol="features"
    ).transform(df).select([idcol, "features"])
    # VectorAssembler usually produces sparse vectors, so the next line should be fine
    df = df.rdd.map(lambda row: IndexedRow(
        row[idcol],
        MLLibVectors.sparse(row.features.size, row.features.indices, row.features.values)))
    return IndexedRowMatrix(df)
def getConnectivity(self, rddv, spark):
    sc = spark.sparkContext
    radius = self.getRadius()
    dist = self.getDistance()
    dlist = rddv.collect()
    featurecol = self.getFeaturesCol()
    irows = [IndexedRow(i, dlist[i][featurecol].toArray()) for i in range(0, len(dlist))]
    imatrix = IndexedRowMatrix(sc.parallelize(irows))
    cart = imatrix.rows.cartesian(imatrix.rows)
    rows = Row("id", "vector")
    usr_row = [rows(i, np.float_(x).tolist()) for i, x in enumerate(dlist)]
    verts = spark.createDataFrame(usr_row)
    # keep only the pairs whose distance is within the radius
    A = cart.filter(lambda x: dist(x[0].vector, x[1].vector) <= radius)\
        .map(lambda x: (x[0].index, x[1].index, 1))
    edges = spark.createDataFrame(A, ['src', 'dst', 'connected'])
    return GraphFrame(verts, edges)
def multiply_transpose2(A: np.ndarray) -> np.ndarray:  # computes A * A.T
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)

    listA = A.tolist()
    rddA = sc.parallelize([IndexedRow(i, listA[i]) for i in range(len(listA))])
    matA = IndexedRowMatrix(rddA).toBlockMatrix()
    matT = matA.transpose()
    matR = matA.multiply(matT)
    res = matR.toLocalMatrix().toArray()

    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:", elapsed_time)
    counter = counter + 1
    return res
def __index_row_matrix_rdd(self, scale_df):
    """
    :param scale_df: dataframe with an 'id' column and an ML 'scaled_features' vector column
    :return: IndexedRowMatrix built from the scaled feature vectors
    """
    try:
        vector_mllib = MLUtils.convertVectorColumnsFromML(
            scale_df, 'scaled_features').drop('features')
        vector_rdd = vector_mllib.select(
            'scaled_features', 'id').rdd.map(lambda x: IndexedRow(x[1], x[0]))
        self.__logger.info("Build Index Row Matrix RDD")
        return IndexedRowMatrix(vector_rdd)
    except TypeError as te:
        raise OpheliaMLException(
            f"An error occurred while calling __index_row_matrix_rdd() method: {te}"
        )
import os

os.environ["SPARK_HOME"] = r"C:\spark"
os.environ["HADOOP_HOME"] = r"C:\winutils"

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)

    # IndexedRowMatrix
    # Method 1:
    m = sc.parallelize([[2, 2, 2], [3, 3, 3]]).zipWithIndex()
    n = sc.parallelize([[1, 1], [4, 4], [3, 3]]).zipWithIndex()

    # Create an IndexedRowMatrix, then convert to a BlockMatrix
    mat1 = IndexedRowMatrix(m.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()
    mat2 = IndexedRowMatrix(n.map(lambda row2: IndexedRow(row2[1], row2[0]))).toBlockMatrix()

    # Method 2:
    # mat1 = BlockMatrix(m, 2, 3)
    # mat2 = BlockMatrix(n, 3, 2)

    # Use the multiply function from pyspark.mllib.linalg.distributed
    mat_mul_output = mat1.multiply(mat2).toLocalMatrix()
    print(mat_mul_output)
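# Worked out by hand for reference (not printed in the original): with
# m = [[2, 2, 2], [3, 3, 3]] and n = [[1, 1], [4, 4], [3, 3]], the 2 x 3 by 3 x 2
# product is [[2*1 + 2*4 + 2*3, 2*1 + 2*4 + 2*3], [3*1 + 3*4 + 3*3, 3*1 + 3*4 + 3*3]]
# = [[16, 16], [24, 24]], so mat_mul_output should be a 2 x 2 DenseMatrix with those entries.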
# --.map(lambda x: (x[0], sorted(x[1], key=lambda x: x[0], reverse=True)))\
# --.map(lambda x: (x[0], [p[1] for p in x[1]]))\
# --.map(lambda x: x[1])\
# --.zipWithIndex()
# ------------------------------------------
# do I have a 2D matrix now?
print("# do I have a 2D matrix now --> FULLY PREDICTED ????????????????????????")
for item in final_stars_FINAL_READY.collect():
    print(item)
print("# do I have a 2D matrix now --> FULLY PREDICTED ??????????????????????? ==> NOW WE KNOW .........")

iris_irm = IndexedRowMatrix(
    final_stars_FINAL_READY.map(lambda x: IndexedRow(x[1], x[0])))

# ------------------------------------------
# https://blog.paperspace.com/dimension-reduction-with-principal-component-analysis/
# do SVD:
num_of_top_sing_values = 2
SVD = iris_irm.computeSVD(num_of_top_sing_values, True)
U = SVD.U
S = SVD.s.toArray()

# compute the eigenvalues and number of components to retain
n = final_stars_FINAL_READY.count()
eigvals = S**2 / (n - 1)
eigvals = np.flipud(np.sort(eigvals))
cumsum = eigvals.cumsum()
print(f)
print(root + folders[f])
data = sc.wholeTextFiles(root + folders[f])
data.cache()
documents = data.map(lambda s: tokenize(s[1])).map(
    lambda s: remove_stopwords(s, stopwords))
files = data.map(lambda s: s[0]).collect()
documents.cache()

hashingTF = HashingTF()
featurizedData = hashingTF.transform(documents)
idf = IDF()
idfModel = idf.fit(featurizedData)
featurizedData.cache()
tfidfs = idfModel.transform(featurizedData)
tfidfs.cache()

final_rdd = tfidfs.zipWithIndex().map(lambda s: IndexedRow(s[1], s[0]))
final_rdd.cache()
sims = IndexedRowMatrix(final_rdd).toCoordinateMatrix().transpose(
).toIndexedRowMatrix().columnSimilarities()
pairs = sims.entries.map(lambda m: [m.i, m.j, m.value]).collect()

# columnSimilarities() only returns the upper triangle, so mirror each pair
for p in range(0, len(pairs)):
    pairs.append([pairs[p][1], pairs[p][0], pairs[p][2]])

# for each document, keep the most similar other document
results = []
for p in range(0, len(files)):
    results.append([p, 0, 0.0])
for p in range(0, len(pairs)):
    index = pairs[p][0]
    if pairs[p][2] > results[index][2]:
        results[index] = [index, pairs[p][1], pairs[p][2]]

file_object = open("/home/user/out/" + folders[f] + ".csv", "w")
def as_block_matrix(rdd, rowsPerBlock=65000, colsPerBlock=65000):
    return IndexedRowMatrix(
        rdd.zipWithIndex().map(lambda xi: IndexedRow(xi[1], xi[0]))
    ).toBlockMatrix(rowsPerBlock, colsPerBlock)
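# A minimal usage sketch (not from the original source), assuming an active
# SparkContext `sc`: wrap two RDDs of rows as BlockMatrix objects and multiply them.
a = as_block_matrix(sc.parallelize([[1.0, 2.0], [3.0, 4.0]]))
b = as_block_matrix(sc.parallelize([[5.0, 6.0], [7.0, 8.0]]))
print(a.multiply(b).toLocalMatrix())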
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
import numpy as np
import os
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix, BlockMatrix

os.environ["SPARK_HOME"] = "C:\\Users\\plfoley\\spark-2.3.1-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:\\Users\\plfoley\\winutils"

sc = SparkContext()

rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) \
    .zipWithIndex()
rows2 = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) \
    .zipWithIndex()

# need a SQLContext() to generate an IndexedRowMatrix from RDD
sqlContext = SQLContext(sc)

rows = IndexedRowMatrix(
    rows.map(lambda row: IndexedRow(row[1], row[0]))
).toBlockMatrix()

rows2 = IndexedRowMatrix(
    rows2.map(lambda row2: IndexedRow(row2[1], row2[0]))
).toBlockMatrix()

mat_product = rows.multiply(rows2).toLocalMatrix()
print(mat_product)
.getOrCreate()

lines = spark.read.text(sys.argv[1]).rdd.map(lambda r: r[0])
articles = lines.map(lambda urls: getArticletText(urls))

hashingTF = HashingTF()
tf = hashingTF.transform(articles)
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

rows = tfidf.zipWithIndex()
bm = IndexedRowMatrix(rows.map(lambda row: IndexedRow(row[1], row[0]))).toBlockMatrix()
# bm_t = bm.transpose()
# result_mat = bm.multiply(bm_t)
# exact = result_mat.toIndexedRowMatrix().toRowMatrix()
exact = bm.transpose().toIndexedRowMatrix().columnSimilarities()
print(exact.entries.collect())
# print(exact.entries.collect()[0])

# parsedArticles = articles.collect()
# tfidf = TfidfVectorizer().fit_transform(parsedArticles)
# pairwise_similarity = tfidf * tfidf.T
# final_stars_FINAL_READY = final_stars_FINAL.rdd\
# --.map(lambda x: (x[0], [(x[1], x[2])]))\
# --.reduceByKey(lambda a,b: a+b)\
# --.map(lambda x: (x[0], sorted(x[1], key=lambda x: x[0], reverse=True)))\
# --.map(lambda x: (x[0], [p[1] for p in x[1]]))\
# --.map(lambda x: x[1])\
# --.zipWithIndex()
# ------------------------------------------
# do I have a 2D matrix now?
print("# do I have a 2D matrix now --> FULLY PREDICTED ????????????????????????")
for item in final_stars_FINAL_READY.collect():
    print(item)
print("# do I have a 2D matrix now --> FULLY PREDICTED ??????????????????????? ==> NOW WE KNOW .........")

iris_irm = IndexedRowMatrix(final_stars_FINAL_READY.map(lambda x: IndexedRow(x[1], x[0])))

# ------------------------------------------
# https://blog.paperspace.com/dimension-reduction-with-principal-component-analysis/
# do SVD:
num_of_top_sing_values = 2
SVD = iris_irm.computeSVD(num_of_top_sing_values, True)
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(dataFrame)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("title", "features").show()

# Normalization and transformation of the matrix
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

# Similarity computed from the norms and the dot product
mat = IndexedRowMatrix(
    data.select("num", "norm")
    .rdd.map(lambda row: IndexedRow(row.num, row.norm.toArray()))).toBlockMatrix()
dot = mat.multiply(mat.transpose())
dot.toLocalMatrix().toArray()

dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
        psf.col("j.num").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .sort("i", "j")\
    .show()

tempcosine = data.alias("i").join(data.alias("j"), psf.col("i.num") < psf.col("j.num"))\
    .select(
        psf.col("i.num").alias("i"),
dv2[2]
dv1.size
dv2.toArray()

from pyspark.mllib.linalg import Matrices
dm = Matrices.dense(2, 3, [5.0, 0.0, 0.0, 3.0, 1.0, 4.0])
sm = Matrices.sparse(2, 3, [0, 1, 2, 4], [0, 1, 0, 1], [5.0, 3.0, 1.0, 4.0])
sm.toDense()
dm.toSparse()
dm[1, 1]

# Section 7.2.2
from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow
rmind = IndexedRowMatrix(
    rm.rows.zipWithIndex().map(lambda x: IndexedRow(x[1], x[0])))

# Section 7.4
housingLines = sc.textFile("first-edition/ch07/housing.data", 6)
housingVals = housingLines.map(
    lambda x: Vectors.dense([float(v.strip()) for v in x.split(",")]))

# Section 7.4.1
from pyspark.mllib.linalg.distributed import RowMatrix
housingMat = RowMatrix(housingVals)

from pyspark.mllib.stat import Statistics
housingStats = Statistics.colStats(housingVals)
housingStats.min()

# Section 7.4.4
from pyspark.mllib.regression import LabeledPoint
mat = RowMatrix(rows)
m = mat.numRows()
n = mat.numCols()
print(m)
print(n)

# An IndexedRowMatrix is similar to a RowMatrix but has row indices, which can be used
# to identify specific rows and are useful for executing joins.
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# an RDD of indexed rows
indexed = sc.parallelize([
    IndexedRow(0, [1, 2, 3]),
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12])
])
mat = IndexedRowMatrix(indexed)
print(mat)

# convert to a row matrix
rowMat = mat.toRowMatrix()
print(rowMat)

# A CoordinateMatrix is distributed and stored in an object called a coordinate list.
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
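# A minimal sketch (not part of the original source): a CoordinateMatrix is built
# from MatrixEntry(row, col, value) records, reusing the SparkContext `sc` above.
entries = sc.parallelize([MatrixEntry(0, 0, 1.2),
                          MatrixEntry(1, 2, 3.4),
                          MatrixEntry(2, 1, 5.6)])
coordMat = CoordinateMatrix(entries)
print(coordMat.numRows(), coordMat.numCols())  # 3 rows, 3 columns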
conf = SparkConf().setAppName("labeledPoints")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
debug = Debugger()
debug.TIMESTAMP(1)
spark = SparkSession(sc)

data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda _: np.array(_.strip().split()).astype(float))
data = data.map(lambda _: _ / np.linalg.norm(_))  # normalize each row vector

U = data.zipWithIndex().map(lambda _: IndexedRow(_[1], _[0]))
U = IndexedRowMatrix(U)
UT = U.toCoordinateMatrix()
UT = UT.transpose()
U = U.toBlockMatrix()
UT = UT.toBlockMatrix()
S = U.multiply(UT)  # S = U * U^T, i.e. pairwise similarities of the normalized rows
S_coord = S.toCoordinateMatrix()
def get_Total_Related_Downloads(self, dfmain):
    # total downloads
    download_count = dfmain.groupby(['_id'])['_id'].agg(['count'])

    # build the datasets vs ip similarity matrix
    group = pd.DataFrame({
        'download_count': dfmain.groupby(['_id', 'ip']).size()
    }).reset_index()
    person_u = list(group.ip.unique())
    dataset_u = list(group._id.unique())

    outF = open(self.DATA_LIST_FILE, "w")
    for line in dataset_u:
        outF.write(str(line))
        outF.write("\n")
    outF.close()

    data = group['download_count'].tolist()
    row = group._id.astype('category', categories=dataset_u).cat.codes
    cols = group.ip.astype('category', categories=person_u).cat.codes
    len_dataset = len(dataset_u)
    len_person = len(person_u)
    print("Datasets vs Ips :", str(len_dataset), str(len_person))  # (309235, 81566)
    sparsemat = sparse.csr_matrix((data, (row, cols)),
                                  dtype=np.int8,
                                  shape=(len_dataset, len_person))
    m, n = sparsemat.shape

    def f(x):
        d = {}
        for i in range(len(x)):
            d[str(i)] = float(x[i])
        return d

    # load PySpark using findSpark package
    # SparkContext.setSystemProperty('spark.executor.memory', '5g')
    # SparkContext.setSystemProperty('spark.driver.memory', '5g')
    # SparkContext.setSystemProperty('spark.executor.heartbeatInterval', '1000000000s')
    # conf = SparkConf().setAppName("simdownload")
    # conf = (conf.setMaster('local[*]').set('spark.executor.memory', '4G'))  # .set('spark.executor.heartbeatInterval','1000000s')
    # sc = SparkContext(conf=conf)
    # sc = SparkContext("local", "simdownload")
    sc = SparkContext(appName="simdownload")
    sqlContext = SQLContext(sc)
    # print(sc._conf.getAll())
    sv_rdd = sc.parallelize(sparsemat.toarray())

    # populate the values from the rdd into a dataframe
    dfspark = sv_rdd.map(lambda x: Row(**f(x))).toDF()
    row_with_index = Row(*["id"] + dfspark.columns)

    def make_row(columns):
        def _make_row(row, uid):
            row_dict = row.asDict()
            return row_with_index(*[uid] + [row_dict.get(c) for c in columns])
        return _make_row

    print('parallelize-ok')
    f = make_row(dfspark.columns)

    # create a new dataframe with an id column (use indexes)
    dfidx = (dfspark.rdd.zipWithIndex().map(lambda x: f(*x)).toDF(
        StructType([StructField("id", LongType(), False)] + dfspark.schema.fields)))

    # compute cosine similarity by rows
    pred = IndexedRowMatrix(
        dfidx.rdd.map(lambda row: IndexedRow(row.id, row[1:])))
    pred1 = pred.toBlockMatrix().transpose().toIndexedRowMatrix()
    pred_sims = pred1.columnSimilarities()

    # convert the coordinate matrix (pred_sims) into a dataframe
    columns = ['from', 'to', 'sim']
    vals = pred_sims.entries.map(lambda e: (e.i, e.j, e.value))
    dfsim = sqlContext.createDataFrame(vals, columns)
    print('Sim Done!')
    print('Time Sim Done: ' + time.strftime("%H:%M:%S"))

    json_data = {}
    for i in range(m):
        target_id = int(dataset_u[i])
        dftemp = dfsim.where((psf.col("from") == i) | (psf.col("to") == i)).sort(
            psf.desc("sim")).limit(self.num_top_dataset)
        df = dftemp.toPandas()
        # v = df.iloc[:, :-1].values
        # ii = np.arange(len(df))[:, None]
        # ji = np.argsort(v == i, axis=1)  # replace `1` with your ID
        # related_ids = (v[ii, ji][:, 0]).tolist()
        # related_datasets = [dataset_u[i] for i in related_ids]
        myarr = []
        for index, rw in df.iterrows():  # this is a bit faster than the numpy approach above
            from_id = rw['from']
            to_id = rw['to']
            if (from_id != i):
                myarr.append(int(from_id))
            if (to_id != i):
                myarr.append(int(to_id))
        related_datasets = [int(dataset_u[i]) for i in myarr]
        downloads = download_count.loc[target_id]['count']
        data = {}
        data['related_datasets'] = related_datasets
        data['total_downloads'] = int(downloads)
        json_data[target_id] = data

    print('Time JSONUSAGE_FILE 1: ' + time.strftime("%H:%M:%S"))
    with open(self.JSONUSAGE_FILE, 'w') as fp:
        json.dump(json_data, fp)
    print('Time JSONUSAGE_FILE 2: ' + time.strftime("%H:%M:%S"))
    sc.stop()
path = "/home/forrest/workspace/LINE/Baselines/AMR/results/19-05-23__23-07-42__MSRParaphraseCorpus/matrix/document-concept-matrix.npz" # Load training data # training = spark.read.format("libsvm").load(path) sc = spark.sparkContext doc_conc_mtx = sparse.load_npz(path) doc_conc_mtx = doc_conc_mtx.todense() shape = doc_conc_mtx.shape indexed_doc_concept = [ IndexedRow(idx, doc_conc_mtx[idx].tolist()[0]) for idx in range(0, shape[0]) ] # indexed_sample = [IndexedRow(idx, doc_conc_list[idx]) for idx in range(0, len(sample_list))] rows = sc.parallelize(indexed_doc_concept) matrix = IndexedRowMatrix(rows) del doc_conc_mtx, indexed_doc_concept np_matrix = indexed_row_matrix_to_numpy_matrix(matrix, (11604, 14428)) print(np_matrix.shape) # svd = mtx.computeSVD(k=100)
def calculate_distance(self, sdf1, sdf2):
    """
    Calculate the distance between the vector-type columns of two spark dataframes
    :param sdf1: must have columns id1 (dtype int) and v1 (dtype Vector)
    :param sdf2: must have columns id2 (dtype int) and v2 (dtype Vector)
    :return:
    """
    cov = RowMatrix(
        sdf1.select(["v1"]).withColumnRenamed("v1", "v")
        .union(sdf2.select(["v2"]).withColumnRenamed("v2", "v"))
        .rdd.map(lambda row: Vectors.fromML(row.asDict()["v"]))
    ).computeCovariance().toArray()

    x, v = np.linalg.eigh(cov)
    indices = 1e-10 <= x

    # we are trying to enforce the data types to be only python types
    n = int(v.shape[0])
    m = int(indices.sum())
    v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]
    v_spark = DenseMatrix(n, m, v_vals)
    x_vals = [float(val) for val in np.diag(x[indices]**-0.5).reshape(-1, ).tolist()]
    x_spark = DenseMatrix(m, m, x_vals)

    # we keep the index to maintain the order
    _sdf1 = sdf1.rdd.zipWithIndex()\
        .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1, index=val_key[1])).toDF()
    _sdf1.persist()
    _sdf2 = sdf2.rdd.zipWithIndex()\
        .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2, index=val_key[1])).toDF()
    _sdf2.persist()

    # we get our indexed row matrices
    _sdf1_mat = IndexedRowMatrix(
        _sdf1.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                             vector=Vectors.fromML(row.asDict()["v1"]))))
    _sdf2_mat = IndexedRowMatrix(
        _sdf2.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                             vector=Vectors.fromML(row.asDict()["v2"]))))

    # we apply our transformation and then set it as our new variable
    _sdf1 = _sdf1.drop("v1").join(_sdf1_mat.multiply(v_spark).multiply(x_spark).rows
        .map(lambda indexed_row: Row(index=indexed_row.index, v1=indexed_row.vector)).toDF(), "index")
    _sdf2 = _sdf2.drop("v2").join(_sdf2_mat.multiply(v_spark).multiply(x_spark).rows
        .map(lambda indexed_row: Row(index=indexed_row.index, v2=indexed_row.vector)).toDF(), "index")

    @F.udf(DoubleType())
    def tmp(vec):
        return float(vec[0].squared_distance(vec[1]))**0.5

    all_sdf = _sdf1.crossJoin(_sdf2)
    dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))
    dist_sdf.persist()
    return dist_sdf