def compute_similarity(df, parquet_path="test", csv_path="testtest.csv"):
    """Compute pairwise cosine similarities between movies from user ratings.

    Pivots the ratings into a user x movie matrix, runs
    ``columnSimilarities()`` (which compares matrix columns, i.e. movies),
    and writes the resulting upper-triangular (i, j, value) entries to disk.

    :param df: DataFrame with at least ``userId``, ``movieId``, ``rating`` columns
    :param parquet_path: output directory for the Parquet dump (default kept
        from the original hard-coded value for backward compatibility)
    :param csv_path: output directory for the single-part CSV dump (default
        kept from the original hard-coded value)
    :return: None — results are persisted to ``parquet_path`` and ``csv_path``
    """
    # One row per user, one column per movie; `first` picks the (single)
    # rating a user gave that movie. Unrated movies pivot to null.
    ratings = df.groupBy("userId").pivot("movieId").agg(
        first(col('rating')).cast("double"))

    # row[0] is userId (used as the row index); row[1:] are the ratings.
    # NOTE(review): Vectors.dense over rows containing nulls (unrated
    # movies) may fail or yield NaNs — confirm upstream filtering/fillna.
    mat = IndexedRowMatrix(
        ratings.rdd.map(lambda row: IndexedRow(row[0], Vectors.dense(row[1:]))))

    cs = mat.columnSimilarities()

    # Build the entries DataFrame once and persist it in both formats.
    entries = cs.entries.toDF()
    entries.write.parquet(parquet_path)
    entries.coalesce(1)\
        .write.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .save(csv_path)
# Missing imports added: the script calls SparkConf, SparkContext,
# SparkSession, IndexedRow and IndexedRowMatrix, none of which were
# imported in this section (NameError at runtime otherwise).
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.linalg import Matrix, Matrices, DenseMatrix
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.mllib.feature import Normalizer

import numpy as np

# NOTE(review): Debugger is project-local and its import is not visible in
# this chunk — confirm it is brought into scope elsewhere in the file.

# Set up the Spark context and session.
conf = SparkConf().setAppName("labeledPoints")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
debug = Debugger()
debug.TIMESTAMP(1)
spark = SparkSession(sc)

# Load whitespace-separated vectors (one per line) and L2-normalize each
# row so that columnSimilarities() below yields cosine similarity.
data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda line: np.array(line.strip().split()).astype(float))
data = data.map(lambda vec: vec / np.linalg.norm(vec))

# Build an IndexedRowMatrix, then transpose it (via CoordinateMatrix) so the
# original rows become columns — columnSimilarities() compares columns only.
irmat = IndexedRowMatrix(
    data.zipWithIndex().map(lambda pair: IndexedRow(pair[1], pair[0])))
comat = irmat.toCoordinateMatrix().transpose()
irmat = comat.toIndexedRowMatrix()

simi = irmat.columnSimilarities()
print(simi.entries.take(10))
debug.TIMESTAMP(2)