def elementwise_product(X: RowMatrix, Y: RowMatrix, spark):
    X = as_df_with_idx(X, "idx", spark)
    Y = as_df_with_idx(Y, "idx", spark)
    Y = Y.withColumnRenamed("_1", "_2")
    X = X.join(Y, on="idx").drop("idx")
    X = X.rdd.map(lambda x: scipy.array(x[0]) * scipy.array(x[1]))
    return X

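# `as_df_with_idx` is not defined in this snippet. A minimal sketch of a
# helper consistent with its use above (an assumption, not the original):
# pair each RowMatrix row with a stable index column, leaving the vector in
# a "_1" column that elementwise_product renames for the second operand.
def as_df_with_idx(X, colname, spark):
    # zipWithIndex yields (row, idx); emit (idx, values).
    return spark.createDataFrame(
        X.rows.zipWithIndex().map(
            lambda t: (t[1], t[0].toArray().tolist())),
        [colname, "_1"])
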
def __init__(self, filename):
    ratings = spark.read.option("inferSchema", "true").option(
        "header", "true").csv(filename)
    self.num_users, self.num_items = ratings.agg(
        *(countDistinct(col(c)).alias(c)
          for c in ("userId", "movieId"))).first()
    # Hold out a random 30% of the items as the test set.
    msk = set(np.random.choice(
        np.arange(1, self.num_items + 1),
        round(self.num_items * 0.3), replace=False))
    is_test = udf(lambda c: bool(c in msk), BooleanType())
    newdf = ratings.withColumn('test', is_test(col('movieId'))).drop(
        col('timestamp'))
    self.trainDF = newdf.filter(col('test') == False)  # noqa: E712
    self.testDF = newdf.filter(col('test') == True)  # noqa: E712
    # RowMatrix expects an RDD of vectors, not a Python list wrapping one.
    rdd_tr = self.trainDF.rdd.map(
        lambda r: Vectors.sparse(self.num_items, {r.movieId: r.rating}))
    rdd_te = self.testDF.rdd.map(
        lambda r: Vectors.sparse(self.num_items, {r.movieId: r.rating}))
    self.tr_mat = RowMatrix(rdd_tr)
    self.te_mat = RowMatrix(rdd_te)

def pca(self, df, k=1):
    cov = RowMatrix(
        df.rdd.map(lambda x: list(x))).computeCovariance().toArray()
    col = cov.shape[1]
    eigVals, eigVecs = np.linalg.eigh(cov)
    inds = np.argsort(eigVals)
    eigVecs = eigVecs.T[inds[-1:-(col + 1):-1]]
    eigVals = eigVals[inds[-1:-(col + 1):-1]]
    components = RowMatrix(
        df.rdd.map(lambda x: list(x))).computePrincipalComponents(k)
    train_data = df.rdd.map(
        lambda x: Row(features=Vectors.dense(x))).toDF()
    pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(train_data)
    score = model.transform(train_data)
    res = {
        "components": components.toArray(),
        "score": np.array(
            score.select("pcaFeatures").rdd.map(
                lambda x: list(x[0])).collect()),
        "eigVectors": eigVecs,
        "eigValues": eigVals
    }
    return res

def compute_similarities(X, sc, threshold=0):
    """Compute column similarities using Spark.

    Sparsity is handled efficiently via a threshold that ensures only
    relevant similarities are computed.

    Parameters
    ----------
    X: a 2-D array; similarities are computed between its columns
    sc: SparkContext
    threshold: the similarity threshold

    Returns
    -------
    Symmetric similarity matrix of shape (X.shape[1], X.shape[1])
    """
    n = X.shape[1]
    rows = sc.parallelize(X)
    mat = RowMatrix(rows)
    sims = mat.columnSimilarities(threshold)
    # Convert to a scipy sparse matrix.
    # Each element is a MatrixEntry object (i, j, value).
    rows_index = np.array(
        sims.entries.map(lambda x: x.i).collect()).astype(int)
    cols_index = np.array(
        sims.entries.map(lambda x: x.j).collect()).astype(int)
    values = np.array(sims.entries.map(lambda x: x.value).collect())
    # columnSimilarities returns only the upper triangle; mirror it.
    triang_sup = coo_matrix((values, (rows_index, cols_index)), shape=(n, n))
    triang_inf = coo_matrix((values, (cols_index, rows_index)), shape=(n, n))
    return (triang_sup + triang_inf).tocsr()

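# A hypothetical usage sketch of compute_similarities; the SparkContext and
# the toy matrix below are assumptions for illustration only.
import numpy as np
from pyspark import SparkContext

sc = SparkContext("local", "column-similarities-demo")
X = np.array([[1.0, 0.0, 2.0],
              [0.0, 3.0, 1.0],
              [4.0, 0.0, 0.0]])
sims = compute_similarities(X, sc, threshold=0.1)
print(sims.toarray())  # dense view of the symmetric CSR similarity matrix
sc.stop()
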
def cluster(self, df, session, repartition_num=8):
    n = df.count()
    # Index rows.
    df_index = df.select((row_number().over(
        Window.partitionBy(lit(0)).orderBy(self.featureCol)) -
                          1).alias('id'), "*")
    df_features = df_index.select('id', self.featureCol)
    # Prepare for joining.
    df_features = df_features.repartitionByRange(repartition_num, 'id')
    left_df = df_features.select(
        df_features['id'].alias('left_id'),
        df_features[self.featureCol].alias('left_features'))
    right_df = df_features.select(
        df_features['id'].alias('right_id'),
        df_features[self.featureCol].alias('right_features'))
    # Self-join on rows where left_id does not equal right_id.
    joined_df = left_df.join(right_df,
                             left_df['left_id'] != right_df['right_id'])
    # Compute cosine similarity between vectors.
    joined_df = joined_df.select(
        'left_id', 'right_id',
        cosine_similarity_udf(
            array(joined_df['left_features'],
                  joined_df['right_features'])).alias('norm'))
    ranked = joined_df.select(
        'left_id', 'right_id',
        rank().over(
            Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
    knn = ranked.where(ranked['rank'] <= 5)
    knn_grouped = knn.groupBy('left_id').agg(
        f.collect_list('right_id').alias('nn'))
    # Generate the Laplacian.
    laplacian = knn_grouped.select(
        'left_id',
        laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                             lit(n), lit(self.k_nearest)).alias('lap_vector'))
    laplacian_matrix = RowMatrix(
        laplacian.select('lap_vector').rdd.map(lambda x: x[0]))
    eigenvectors = laplacian_matrix.computePrincipalComponents(
        k=self.num_eigenvectors)
    eigenvectors = [
        (idx, Vectors.dense([float(item) for item in row]))
        for idx, row in enumerate(eigenvectors.toArray().tolist())
    ]
    eigen_df = session.createDataFrame(eigenvectors,
                                       ['id', self.featureCol])
    model = KMeans(featuresCol=self.featureCol,
                   predictionCol=self.predictionCol,
                   k=self.k).fit(eigen_df)
    predictions = model.transform(eigen_df).join(df_index, on='id')
    return predictions

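# `cosine_similarity_udf` and `laplacian_vector_udf` are helpers not shown in
# this snippet. A plausible cosine-similarity UDF matching the call above,
# assuming the feature column holds array<double> values (a sketch, not the
# original definition):
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

@udf(DoubleType())
def cosine_similarity_udf(pair):
    # `pair` is the two-element array built with array(left, right).
    a, b = np.array(pair[0]), np.array(pair[1])
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0
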
def test_row_matrix_from_dataframe(self):
    from pyspark.sql.utils import IllegalArgumentException
    df = self.spark.createDataFrame([Row(Vectors.dense(1))])
    row_matrix = RowMatrix(df)
    self.assertEqual(row_matrix.numRows(), 1)
    self.assertEqual(row_matrix.numCols(), 1)
    with self.assertRaises(IllegalArgumentException):
        RowMatrix(df.selectExpr("'monkey'"))

def within_group_scatter(data: pyspark.sql.DataFrame,
                         features, response, targets):
    p = len(features)
    sw = numpy.zeros((p, p))
    for target in targets:
        df_t = data.filter("{} == '{}'".format(response, target))
        X_t = RowMatrix(df_t.select(features).rdd.map(numpy.array))
        sw += X_t.computeCovariance().toArray() * (df_t.count() - 1)
    return sw

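# A small, hypothetical usage of within_group_scatter; the SparkSession
# `spark` and the toy schema below are assumptions.
df = spark.createDataFrame(
    [(1.0, 2.0, "a"), (2.0, 1.0, "a"), (5.0, 6.0, "b"), (6.0, 5.0, "b")],
    ["f1", "f2", "label"])
sw = within_group_scatter(df, ["f1", "f2"], "label", ["a", "b"])
print(sw)  # 2x2 pooled within-class scatter matrix
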
def join(data: sql.DataFrame, X: RowMatrix, spark, on=FEATURES__):
    as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())
    X = spark.createDataFrame(X.rows.map(lambda x: (x,)))
    X = X.withColumnRenamed("_1", on)
    X = X.withColumn(on, as_ml(on))
    ri = "row_index"
    # Note: monotonically_increasing_id only aligns the two frames if their
    # partitioning is identical; it is not a stable row index in general.
    X = X.withColumn(ri, func.monotonically_increasing_id())
    data = data.withColumn(ri, func.monotonically_increasing_id())
    data = data.join(X[ri, on], on=[ri]).drop(ri)
    return data

def main():
    datasetfile = sys.argv[1]
    beta = 0.8
    iterations = 40
    top_k = 5
    sparkcontext = SparkContext("local", "Page Rank")
    data = sparkcontext.textFile(datasetfile)
    source_dest = data.map(make_key_value_pair_1)
    source_dest_count = data.map(make_key_value_pair_2)
    groupbykey = source_dest.groupByKey()
    number_of_nodes = groupbykey.count()
    out_degree = groupbykey.map(calc_out_degree)
    pair_map = groupbykey.collectAsMap()
    # Build the column-stochastic transition matrix M.
    matrix_m = np.zeros(shape=(number_of_nodes, number_of_nodes))
    for key, value in pair_map.items():
        for ind_value in value:
            matrix_m[ind_value - 1][key - 1] += 1 / len(list(value))
    matrix_m = RowMatrix(sparkcontext.parallelize(matrix_m))
    vector_r_prev = np.full(number_of_nodes, 1 / number_of_nodes)
    vector_r_prev = DenseMatrix(number_of_nodes, 1, vector_r_prev)
    index = 0
    while index < iterations:
        # r = beta * M r + (1 - beta) / n
        mul_val = matrix_m.multiply(vector_r_prev).rows.collect()
        # Each row of the product is a length-1 DenseVector; extract the
        # scalar before rebuilding the local column matrix.
        mul_val = [float(v[0]) * beta + (1 - beta) / number_of_nodes
                   for v in mul_val]
        vector_r_prev = DenseMatrix(number_of_nodes, 1, mul_val)
        index += 1
    vector_r_prev = vector_r_prev.toArray().flatten()
    largest_values = heapq.nlargest(top_k, vector_r_prev)
    largest_indexes = heapq.nlargest(top_k, range(number_of_nodes),
                                     vector_r_prev.__getitem__)
    smallest_values = heapq.nsmallest(top_k, vector_r_prev)
    smallest_indexes = heapq.nsmallest(top_k, range(number_of_nodes),
                                       vector_r_prev.__getitem__)
    # Node ids are 1-based.
    largest_indexes = [val + 1 for val in largest_indexes]
    smallest_indexes = [val + 1 for val in smallest_indexes]
    print("Value of largest n nodes\n", largest_values)
    print("Node numbers of largest n nodes\n", largest_indexes)
    print("Value of smallest n nodes\n", smallest_values)
    print("Node numbers of smallest n nodes\n", smallest_indexes)
    sparkcontext.stop()

def get_svd_U(tfidf_rdd, n_topics=3):
    # Distributed matrix built from the tf-idf rows.
    matrix_rdd = RowMatrix(tfidf_rdd)
    svd = matrix_rdd.computeSVD(n_topics, computeU=True)
    # Left singular vectors; type is RowMatrix.
    svd_u = svd.U
    # Array of DenseVectors, m_documents x n_topics: [[topic_i, ...], ...]
    return svd_u.rows.collect()

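# Hypothetical usage; `sc` stands in for the SparkContext that the
# surrounding tf-idf pipeline would normally provide.
tfidf_rdd = sc.parallelize([[0.1, 0.0, 0.3],
                            [0.0, 0.2, 0.1],
                            [0.4, 0.1, 0.0]])
for doc_vector in get_svd_U(tfidf_rdd, n_topics=2):
    print(doc_vector)  # one topic-space vector per document
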
def distribution_data():
    vectors = data.map(lambda p: p.features)
    # Build a RowMatrix from each row of the data.
    matrix = RowMatrix(vectors)
    matrixSummary = matrix.computeColumnSummaryStatistics()
    print("mean of each column:")
    print(matrixSummary.mean())
    print("min of each column:")
    print(matrixSummary.min())
    print("max of each column:")
    print(matrixSummary.max())
    print("variance of each column:")
    print(matrixSummary.variance())

def similarity_processing(self, tag_path):
    conf = SparkConf().setAppName("Test").setMaster("local")
    sc = SparkContext(conf=conf)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    df = spark.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(tag_path, header=True)
    df = df.drop("tagId")
    print(df.columns)
    rdd = df.rdd.map(list)
    mat = RowMatrix(rdd)
    print(mat.numCols(), mat.numRows())
    cs = mat.columnSimilarities()
    for x in cs.entries.collect():
        print(x)
    print(cs.numRows(), cs.numCols())

def spark_abs(M1, sc):
    asarr = lambda x: x.toArray().tolist()
    M = M1.rows.collect()
    V1 = list(map(asarr, M))
    L = np.abs(V1).tolist()
    return RowMatrix(sc.parallelize(L))

def U(self):
    """
    Returns a RowMatrix whose columns are the left singular
    vectors of the SVD if computeU was set to be True.
    """
    u = self.call("U")
    if u is not None:
        return RowMatrix(u)

def spark_sub(M1, M2, sc):
    V1 = M1.rows.collect()
    V2 = M2.rows.collect()
    lsub = lambda x1, x2: x1 - x2
    V3 = list(map(lsub, V1, V2))
    return RowMatrix(sc.parallelize(V3))

def _preprocess_data(self, data):
    X = self._feature_matrix(data)
    n = X.count()
    self.__means, var = column_statistics(X)
    var = var * (n - 1) / n
    X = RowMatrix(center(X, means=self.__means))
    return X, self.__means, var

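# `column_statistics` and `center` are not shown in this snippet. Plausible
# definitions consistent with the usage above, assuming X is an RDD of
# feature vectors (these are sketches, not the original helpers):
from pyspark.mllib.stat import Statistics

def column_statistics(X):
    # Column means and sample variances via Spark's summary statistics.
    summary = Statistics.colStats(X)
    return summary.mean(), summary.variance()

def center(X, means):
    # Subtract the column means from every row vector.
    return X.map(lambda r: r.toArray() - means)
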
def _preprocess_data(self, data):
    if isinstance(data, pyspark.sql.DataFrame):
        X = self._feature_matrix(data)
    else:
        X = data.rows
    X, self.__means, self.__vars = scale(X)
    return RowMatrix(X)

def test_pca(self):
    expected_pcs = array([
        [0.0, 1.0, 0.0],
        [sqrt(2.0) / 2.0, 0.0, sqrt(2.0) / 2.0],
        [sqrt(2.0) / 2.0, 0.0, -sqrt(2.0) / 2.0]
    ])
    n = 3
    denseMat = RowMatrix(self.sc.parallelize(self.denseData))
    sparseMat = RowMatrix(self.sc.parallelize(self.sparseData))
    for mat in [denseMat, sparseMat]:
        for k in range(1, 4):
            pcs = mat.computePrincipalComponents(k)
            self.assertEqual(pcs.numRows, n)
            self.assertEqual(pcs.numCols, k)
            # We can just test the updated principal component for equality.
            self.assertEqualUpToSign(pcs.toArray()[:, k - 1],
                                     expected_pcs[:, k - 1])

def plot_pca(myrdd, title, color):
    mat = RowMatrix(myrdd)
    pc = mat.computePrincipalComponents(2)
    # Project the rows to the linear space spanned by the top 2
    # principal components.
    projected = mat.multiply(pc)
    a = projected.rows.collect()
    sum_pca1 = 0
    sum_pca2 = 0
    for i in a:
        sum_pca1 = sum_pca1 + i[0]
        sum_pca2 = sum_pca2 + i[1]
        plt.plot(i[0], i[1], 'o', color=color)
    ave_pca1 = sum_pca1 / len(a)
    ave_pca2 = sum_pca2 / len(a)
    plt.plot(ave_pca1, ave_pca2, '^', markersize=10, color='red')
    plt.title(title)
    plt.show()

def _transform(self, dataset):
    sc = SparkContext.getOrCreate()
    # Get the spectral clustering projection.
    P = self.getProjection()
    # Get the data.
    x = dataset.select(self.getFeaturesCol())
    rdd2 = x.rdd.map(list)
    # Get the data used to compute the projection.
    rdd = self.getPrevdata()
    # Compute the distance between the new data and the "training" data.
    Aarr = self._dist_matrix(rdd, rdd2, sc)
    Arm = RowMatrix(sc.parallelize(Aarr))
    # Transform the new data.
    result = Arm.multiply(P)
    df = result.rows.map(lambda x: Row(x.toArray().tolist())).toDF()
    return df.withColumnRenamed("_1", "projection")

def test_row_matrix_invalid_type(self):
    rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
    invalid_type = ""
    matrix = RowMatrix(rows)
    self.assertRaises(TypeError, matrix.multiply, invalid_type)
    irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]),
                                 IndexedRow(1, [4, 5, 6])])
    imatrix = IndexedRowMatrix(irows)
    self.assertRaises(TypeError, imatrix.multiply, invalid_type)

def set_elem(rowMatrix, i, j, value):
    n = rowMatrix.numRows()
    # Collect once rather than once per row.
    rows = rowMatrix.rows.collect()
    a = np.array([rows[my_iter].toArray() for my_iter in range(n)])
    np.put(a, i * n + j, value)
    return RowMatrix(sc.parallelize(a), n, n)

def fourier(X: RowMatrix, n_features, seed=23, gamma=1):
    p = X.numCols()
    random_state = numpy.random.RandomState(seed)
    w = numpy.sqrt(2 * gamma) * random_state.normal(size=(p, n_features))
    w = DenseMatrix(p, n_features, w.flatten(), isTransposed=True)
    b = random_state.uniform(0, 2 * numpy.pi, size=n_features)
    Y = fourier_transform(X, w, b)
    return Y, w, b

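# `fourier_transform` is not defined above. A sketch of a random Fourier
# feature map consistent with the signature used (an assumption; see
# Rahimi & Recht, "Random Features for Large-Scale Kernel Machines"):
import numpy
from pyspark.mllib.linalg.distributed import RowMatrix

def fourier_transform(X, w, b):
    # Project the rows onto the random directions w, shift by b, and take
    # the cosine; the scaling makes the features approximate an RBF kernel.
    n_features = len(b)
    Y = X.multiply(w)  # (n x p) times (p x n_features)
    scale = numpy.sqrt(2.0 / n_features)
    return RowMatrix(
        Y.rows.map(lambda r: scale * numpy.cos(r.toArray() + b)))
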
def svd(data: RowMatrix, n_components=None):
    """
    Computes a singular value decomposition of a data matrix and the
    variance that is left unexplained by the first n_components.

    :param data: a RowMatrix
    :param n_components: number of components to be returned
    :return: returns the estimated components of a SVD
    :rtype: a triple of (s, V, var)
    """
    logger.info("Computing SVD")
    svd = data.computeSVD(data.numCols(), computeU=False)
    s = svd.s.toArray()
    V = svd.V.toArray().T
    var = scipy.dot(s, s)
    if n_components is not None:
        var = scipy.dot(s[n_components:], s[n_components:])
        s, V = s[:n_components], V[:n_components]
    return s, V, var

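# Hypothetical usage of svd(); `sc` is an assumed SparkContext.
import numpy
mat = RowMatrix(sc.parallelize(numpy.random.rand(10, 4)))
s, V, var = svd(mat, n_components=2)
print(s.shape, V.shape)  # (2,) and (2, 4): top-2 singular values/components
print(var)  # squared singular values not captured by the first 2 components
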
def singular_value_decomposition(self, n_components):
    rdd = self.tfidf.select(
        'id', 'features').rdd.map(lambda row: row[1].toArray())
    rdd.persist(ps.StorageLevel.MEMORY_AND_DISK)
    mat = RowMatrix(self.rdd_transpose(rdd))
    svd = computeSVD(mat, n_components, computeU=True)
    print(svd.U.numCols(), svd.U.numRows())
    print(type(svd.V))
    self.vt = svd.V
    self.similarity_matrix = cosine_similarity(svd.V.toArray())
    self.five_most_similar_beers = self.sql_context.createDataFrame(
        [np.argsort(x)[::-1][:6].tolist() for x in self.similarity_matrix],
        ['id', 'first', 'second', 'third', 'fourth', 'fifth'])
    self.tfidf = self.tfidf.join(self.five_most_similar_beers, ['id'],
                                 'inner')
    self.token, self.db = database.connect_to_database()

    # Use default arguments to avoid closing over the token and db variables.
    def save_to_firebase(x, token=self.token, db=self.db):
        data = {
            'brewery_name': x.brewery_name,
            'beer_name': x.beer_name,
            'state': x.state,
            'beer_style': x.beer_style,
            'first': x.first,
            'second': x.second,
            'third': x.third,
            'fourth': x.fourth,
            'fifth': x.fifth,
            'top1': x.top1,
            'top2': x.top2,
            'top3': x.top3,
            'top4': x.top4,
            'top5': x.top5,
            'top6': x.top6,
            'top7': x.top7
        }
        db.child('beers').child(x.id).set(data, token)

    self.tfidf.rdd.foreach(lambda x: save_to_firebase(x))

def add(self, vector):
    if count_nonzero(vector) == 0:
        return
    # If the sketch matrix is full, run the rank-reduction step to free
    # half of the rows.
    if self.emptyRows <= 0:
        self.svd = self.distributedSketchMatrix.computeSVD(
            self.rows, computeU=False)
        # The singular values are stored in a local dense vector.
        self.S = self.svd.s.array
        self.S.flags.writeable = True
        # The V factor is a local dense matrix.
        self.V = self.svd.V
        self.reduceRank()
    # Push the new vector to the next zero row and advance the index.
    self.localSketchMatrix[self.nextZeroRow, :] = vector
    del self.distributedSketchMatrix
    self.distributedSketchMatrix = RowMatrix(
        self.sc.parallelize(self.localSketchMatrix))
    self.nextZeroRow += 1
    self.emptyRows -= 1

def similarity(feature_vecs, columnSimilarities_threshold):
    # Transpose `prod_features_rdd` via a CoordinateMatrix.
    def transpose(rm):
        cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
            lambda x: [MatrixEntry(x[1], j, v)
                       for j, v in enumerate(x[0])]))
        return cm.transpose().toRowMatrix()

    rowmat = RowMatrix(feature_vecs)
    colmat = transpose(rowmat)
    sims = colmat.columnSimilarities(columnSimilarities_threshold)
    return sims

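# Hypothetical usage; `sc` is an assumed SparkContext. Because the matrix is
# transposed first, the entries are similarities between the original rows
# (e.g. products), not between feature columns.
feature_vecs = sc.parallelize([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
for entry in similarity(feature_vecs, 0.0).entries.collect():
    print(entry)  # MatrixEntry(i, j, cosine similarity of rows i and j)
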
def lu_factorization(A):
    n = A.numRows()
    L = RowMatrix(sc.parallelize(np.eye(n)), n, n)
    U = RowMatrix(sc.parallelize(np.zeros((n, n))), n, n)
    for k in range(0, n):
        for i in range(k + 1, n):
            L = set_elem(L, i, k, get_elem(A, i, k) / get_elem(A, k, k))
        for j in range(k, n):
            U = set_elem(U, k, j, get_elem(A, k, j))
        for i in range(k + 1, n):
            for j in range(k + 1, n):
                A = set_elem(
                    A, i, j,
                    get_elem(A, i, j) -
                    get_elem(L, i, k) * get_elem(U, k, j))
    return L, U

def main():
    if len(sys.argv) < 2:
        print('USAGE: lu_factorization.py <dim of matrix>')
        return
    n = int(sys.argv[1])
    rows = sc.parallelize(np.random.randint(n * n, size=(n, n)))
    mat = RowMatrix(rows, n, n)
    L, U = lu_factorization(mat)
    print('**************finished LU factorization!')

def __init__(self, sc, rows, columns, op='fd'):
    """
    Matrix sketching using Frequent Directions.

    Choose 'fd' for normal Frequent Directions, 'ssd' for Space Saving
    Directions, 'cfd' for Compensative Frequent Directions, 'isvd' for
    iterative SVD, and a number between 0 and 1 for Parameterized
    Frequent Directions.
    """
    self.class_name = 'MatrixSketching'
    self.sc = sc
    self.op = op
    self.columns = columns
    self.rows = rows
    self.localSketchMatrix = zeros((self.rows, self.columns))
    self.distributedSketchMatrix = RowMatrix(
        self.sc.parallelize(self.localSketchMatrix))
    self.S = zeros(self.rows)
    self.U = []
    self.V = []
    self.step = 1
    self.nextZeroRow = 0
    self.emptyRows = self.rows
    # Parse the operation parameter.
    if self.op == 'fd':
        print("Matrix Sketching Using Frequent Direction")
        self.reduceRank = self.__FDOperate__
    elif self.op == 'ssd':
        print("Matrix Sketching Using Space Saving Direction")
        self.op = 2
        self.reduceRank = self.__SSDOperate__
    elif self.op == 'cfd':
        print("Matrix Sketching Using Compensative Frequent Direction")
        self.reduceRank = self.__CFDperate__
    elif self.op == 'isvd':
        print("Matrix Sketching Using iSVD")
        self.reduceRank = self.__iSVDOperate__
    elif type(self.op) != str and 0 < self.op < 1:
        print("Matrix Sketching Using Parameterized Frequent Direction")
        self.reduceRank = self.__PFDOperate__
        self.DELTA = 0
    else:
        raise ValueError("Type of reduce-rank algorithm is not correct")

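# A minimal usage sketch, assuming a live SparkContext `sc`: stream 100
# random rows through an 8 x 20 Frequent Directions sketch via the add()
# method defined above.
import numpy as np
sketch = MatrixSketching(sc, rows=8, columns=20, op='fd')
for _ in range(100):
    sketch.add(np.random.rand(20))
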
def test_svd(self):
    denseMat = RowMatrix(self.sc.parallelize(self.denseData))
    sparseMat = RowMatrix(self.sc.parallelize(self.sparseData))
    m = 4
    n = 3
    for mat in [denseMat, sparseMat]:
        for k in range(1, 4):
            rm = mat.computeSVD(k, computeU=True)
            self.assertEqual(rm.s.size, k)
            self.assertEqual(rm.U.numRows(), m)
            self.assertEqual(rm.U.numCols(), k)
            self.assertEqual(rm.V.numRows, n)
            self.assertEqual(rm.V.numCols, k)
    # Test that U returned is None if computeU is set to False.
    self.assertEqual(mat.computeSVD(1).U, None)
    # Test that low rank matrices cannot have number of singular values
    # greater than a limit.
    rm = RowMatrix(self.sc.parallelize(tile([1, 2, 3], (3, 1))))
    self.assertEqual(rm.computeSVD(3, False, 1e-6).s.size, 1)

def svd(mat, k=1000):
    matRow = RowMatrix(mat)
    matSVD = matRow.computeSVD(k=k, computeU=True)
    return matSVD

# In[66]:
createVector(sampleItem)

# In[67]:
tfidfVector.persist(StorageLevel.MEMORY_AND_DISK)
tfidfVector.count()
docVect.unpersist()

# In[68]:
# Construct a row matrix from the terms and metadata of each video.
mat = RowMatrix(tfidfVector.values())
m = mat.numRows()  # number of rows in the matrix
n = mat.numCols()  # number of columns in the matrix
# Compute the SVD of `mat` to obtain the factor matrices:
# svd = mat.computeSVD(30, computeU=True)

# In[69]:
type(mat)

# In[70]:
# http://stackoverflow.com/questions/33428589/pyspark-and-pca-how-can-i-extract-the-eigenvectors-of-this-pca-how-can-i-calcu/33500704#33500704
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper

from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSVDExample")

    # $example on$
    rows = sc.parallelize([
        Vectors.sparse(5, {1: 1.0, 3: 7.0}),
        Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
        Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    ])

    mat = RowMatrix(rows)

    # Compute the top 5 singular values and corresponding singular vectors.
    svd = mat.computeSVD(5, computeU=True)
    U = svd.U  # The U factor is a RowMatrix.
    s = svd.s  # The singular values are stored in a local dense vector.
    V = svd.V  # The V factor is a local dense matrix.
    # $example off$
    collected = U.rows.collect()
    print("U factor is:")
    for vector in collected:
        print(vector)
    print("Singular values are: %s" % s)
    print("V factor is:\n%s" % V)

    sc.stop()

from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonPCAOnRowMatrixExample")

    # $example on$
    rows = sc.parallelize([
        Vectors.sparse(5, {1: 1.0, 3: 7.0}),
        Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
        Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    ])

    mat = RowMatrix(rows)

    # Compute the top 4 principal components.
    # Principal components are stored in a local dense matrix.
    pc = mat.computePrincipalComponents(4)

    # Project the rows to the linear space spanned by the top 4
    # principal components.
    projected = mat.multiply(pc)
    # $example off$
    collected = projected.rows.collect()
    print("Projected Row Matrix of principal component:")
    for vector in collected:
        print(vector)

    sc.stop()

# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix


def main(sc, sqlContext, isHive=True):
    pass


if __name__ == "__main__":
    os.environ["SPARK_HOME"] = r"C:\spark-1.6.1-bin-hadoop2.6"
    sc = SparkContext('local[1]')
    rddRows = sc.parallelize(["1 0 2 0 0 1", "0 0 4 2 0 0"])
    # map() returns a new RDD; the result must be assigned, or RowMatrix
    # would be built from the raw strings.
    rddRows = rddRows.map(
        lambda x: Vectors.dense([float(each) for each in str(x).split(" ")]))
    mat = RowMatrix(rddRows)
    simsPerfect = mat.columnSimilarities()