lambda x: x[1] > 4).toDF()
# Keep only reviews from users with more than four reviews: the left-semi
# join filters reviews_data to user_index values present in review_count_per_user.
reviews_dataframe = reviews_data.join(review_count_per_user, 'user_index',
                                      'leftsemi')

# Seeded 80/20 split so the run is reproducible alongside the seeded ALS below.
(training, test) = reviews_dataframe.randomSplit([0.8, 0.2], seed=123)
print('########################## Training ###########################')
als = ALS(userCol="user_index",
          itemCol="business_index",
          ratingCol="stars",
          coldStartStrategy="drop")  # drop NaN predictions for unseen users/items
als.setSeed(123)
# Hyperparameter grid: fixed maxIter, sweeping rank and regParam.
grid = (ParamGridBuilder()
        .addGrid(als.maxIter, [20])
        .addGrid(als.rank, [20, 30, 40, 50, 60, 70])
        .addGrid(als.regParam, [0.45, 0.5, 0.55])
        .build())
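# 1 maxIter x 6 rank x 3 regParam = 18 candidate settings; with the 5-fold
# cross-validation below, that is 18 x 5 = 90 ALS fits plus one final refit.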
evaluator = RegressionEvaluator(predictionCol=als.getPredictionCol(),
                                labelCol=als.getRatingCol(),
                                metricName='rmse')
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)
cvModel = cv.fit(training)
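# A minimal sketch for inspecting the winning configuration (uses only the
# objects defined above): avgMetrics holds one mean RMSE per parameter map,
# in grid order, so the smallest value marks the selected combination.
best_idx = min(range(len(cvModel.avgMetrics)), key=cvModel.avgMetrics.__getitem__)
print('Best params: {}'.format({p.name: v for p, v in grid[best_idx].items()}))
print('Best mean CV RMSE: {}'.format(cvModel.avgMetrics[best_idx]))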

# Raw string avoids backslash-escape surprises in the Windows path.
cvModel.save(r'E:\Big_data_project\model\collab_montreal_model\bestModel')
predictions = cvModel.transform(test)
predictions.cache()
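# Quick sanity check on a few held-out predictions (column names as
# configured on the ALS estimator above).
predictions.select('user_index', 'business_index', 'stars', 'prediction').show(5)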

print('########################## Computing RMSE ###########################')

rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                     labelCol='stars',
                                     metricName='rmse')
print('RMSE on test set: {}'.format(rmse_evaluator.evaluate(predictions)))

from math import sqrt
from operator import add
from pyspark.ml.evaluation import Evaluator


class MiEvaluador(Evaluator):
    def __init__(self, predictionCol='prediction', targetCol='label'):
        super(MiEvaluador, self).__init__()
        self.predictionCol = predictionCol
        self.targetCol = targetCol

    def _evaluate(self, dataset):
        error = self.rmse(dataset, self.predictionCol, self.targetCol)
        print("Error: {}".format(error))
        return error

    def isLargerBetter(self):
        return False  # RMSE is an error metric: lower is better.

    @staticmethod
    def rmse(dataset, predictionCol, targetCol):
        # DataFrames have no .map() in Spark 2+; go through the underlying RDD.
        return sqrt(dataset.dropna().rdd
                    .map(lambda x: (x[targetCol] - x[predictionCol]) ** 2)
                    .reduce(add) / float(dataset.count()))

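# Standalone sanity check of MiEvaluador. Hypothetical input: dfRatings, a
# ratings DataFrame with the ALS default columns 'user', 'item' and 'rating'.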
lr1 = ALS()
grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0, 0.5, 2.0]).build()
evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(),
                         targetCol=lr1.getRatingCol())
cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1,
                     evaluator=evaluator1, numFolds=2)
cvModel1 = cv1.fit(dfRatings)
# Note: evaluated on the same data used for fitting, so this RMSE is optimistic.
predictions1 = cvModel1.transform(dfRatings)
error_cross_validation = MiEvaluador.rmse(predictions1, lr1.getPredictionCol(),
                                          lr1.getRatingCol())
print('Cross-validation error: {}'.format(error_cross_validation))

error_models = []
for reg_param in (1.0, 0.5, 2.0):
    lr = ALS(regParam=reg_param)
    model = lr.fit(dfRatings)
    error = MiEvaluador.rmse(model.transform(dfRatings),
                             lr.getPredictionCol(), lr.getRatingCol())
    error_models.append(error)
    print('reg_param: {}, rmse: {}'.format(reg_param, error))
    
import numpy as np
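# Sketch using the numpy import above: report which regParam from the loop
# gave the lowest training RMSE (order matches the tuple iterated above).
reg_params = (1.0, 0.5, 2.0)
print('Lowest training RMSE at reg_param = {}'.format(
    reg_params[int(np.argmin(error_models))]))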