lambda x: x[1] > 4).toDF() reviews_dataframe = reviews_data.join(review_count_per_user, 'user_index', 'leftsemi') (training, test) = reviews_dataframe.randomSplit([0.8, 0.2]) print('########################## Training ###########################') als = ALS(userCol="user_index", itemCol="business_index", ratingCol="stars", coldStartStrategy="drop") als.setSeed(123) # Setting parameters for grid builder grid = ParamGridBuilder().addGrid(als.maxIter, [20]).addGrid( als.rank, [20, 30, 40, 50, 60, 70]).addGrid(als.regParam, [0.45, 0.5, 0.55]).build() evaluator = RegressionEvaluator(predictionCol=als.getPredictionCol(), labelCol=als.getRatingCol(), metricName='rmse') cv = CrossValidator(estimator=als, estimatorParamMaps=grid, evaluator=evaluator, numFolds=5) cvModel = cv.fit(training) cvModel.save('E:\Big_data_project\model\collab_montreal_model\\bestModel') predictions = cvModel.transform(test) predictions.cache() print('########################## Computing RMSE ###########################') rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
error=self.rmse(dataset,self.predictionCol,self.targetCol) print ("Error: {}".format(error)) return error def isLargerBetter(self): return False @staticmethod def rmse(dataset,predictionCol,targetCol): return sqrt(dataset.dropna().map(lambda x: (x[targetCol] - x[predictionCol]) ** 2).reduce(add) / float(dataset.count())) lr1 = ALS() grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0,0.5,2.0]).build() evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(),targetCol=lr1.getRatingCol()) cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1, evaluator=evaluator1, numFolds=2) cvModel1 = cv1.fit(dfRatings) a=cvModel1.transform(dfRatings) error_cross_validation=MiEvaluador.rmse(a,lr1.getPredictionCol(),lr1.getRatingCol()) print ('ERROR de validacion: {}'.format(error_cross_validation)) error_models=[] for reg_param in (1.0,0.5,2.0): lr = ALS(regParam=reg_param) model = lr.fit(dfRatings) error=MiEvaluador.rmse(model.transform(dfRatings),lr.getPredictionCol(),lr.getRatingCol()) error_models.append(error) print ('reg_param: {}, rmse: {}'.format(reg_param,error)) import numpy as np