from pyspark.sql.functions import col, first
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix


def compute_similarity(df):
    """
    Compute cosine similarities between every pair of movie columns.

    :param df: DataFrame of ratings with userId, movieId and rating columns
    :return: None; the similarity entries are written to parquet and CSV
    """

    # Optional debugging aid: restrict the computation to a few movies.
    # df = df.filter(df.movieId.isin([91542.0, 1.0, 5.0, 90.0, 2541.0, 1246.0, 1552.0, 4084.0, 5679.0]))

    # Pivot to a user x movie rating matrix. Movies a user has not rated
    # come out as nulls, which Vectors.dense cannot handle, so fill them
    # with 0.0.
    df = df.groupBy("userId").pivot("movieId").agg(
        first(col("rating")).cast("double")).fillna(0.0)

    # One IndexedRow per user: the (numeric) userId becomes the row index
    # and the remaining columns are that user's ratings.
    mat = IndexedRowMatrix(
        df.rdd.map(lambda row: IndexedRow(row[0], Vectors.dense(row[1:]))))

    # Upper-triangular CoordinateMatrix of pairwise cosine similarities
    # between columns (i.e. between movies).
    cs = mat.columnSimilarities()

    path = "test"

    # Persist the (i, j, value) similarity entries as parquet.
    cs.entries.toDF().write.parquet(path)

    # Also write a single CSV file. coalesce(1) funnels all data through
    # one task, so only do this for small results.
    cs.entries.toDF().coalesce(1)\
        .write.option("header", "true")\
        .csv("testtest.csv")
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# Set up the Spark context and config.
conf = SparkConf().setAppName("labeledPoints")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

debug = Debugger()  # project-local timing helper
debug.TIMESTAMP(1)
spark = SparkSession(sc)

# Load whitespace-separated vectors (one per line) and L2-normalize each
# row so that dot products equal cosine similarities.
data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda line: np.array(line.strip().split()).astype(float))
data = data.map(lambda vec: vec / np.linalg.norm(vec))

# Attach row indices and wrap into a distributed row matrix.
irmat = data.zipWithIndex().map(lambda pair: IndexedRow(pair[1], pair[0]))
irmat = IndexedRowMatrix(irmat)

# columnSimilarities() compares columns, so transpose first to get the
# similarities between the original rows. IndexedRowMatrix has no
# transpose, hence the round-trip through a CoordinateMatrix.
comat = irmat.toCoordinateMatrix()
comat = comat.transpose()

irmat = comat.toIndexedRowMatrix()
simi = irmat.columnSimilarities()
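
# A possible variant, not in the original code: for large matrices,
# RowMatrix.columnSimilarities accepts a threshold that switches to the
# approximate DIMSUM sampling scheme, trading accuracy on pairs below
# the threshold for less shuffle traffic:
#
#     simi = irmat.toRowMatrix().columnSimilarities(threshold=0.1)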

# Collecting every entry is only feasible for small matrices:
# simi_list = simi.entries.collect()
# print(len(simi_list))
# print(simi_list)

# Print a small sample of (i, j, value) similarity entries instead.
print(simi.entries.take(10))
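
# To persist the full result instead of printing a sample, the entries
# RDD can be written out as in compute_similarity above; the output path
# below is illustrative, not from the original code:
#
#     simi.entries.toDF().write.parquet("hdfs://node1:9000/output/simi")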

debug.TIMESTAMP(2)
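
# Release cluster resources once the job is done.
sc.stop()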