def als_grid_search_ranking(train, val, maxIters, regParams, ranks):
    '''
    Grid search over ALS hyper-parameters, scored with ranking metrics.

    Params:
        train: training set (Spark DataFrame with user_id, book_id, rating)
        val: validation set (same schema as train)
        maxIters: list of maxIter values to try
        regParams: list of regParam values to try
        ranks: list of rank values to try
    Return:
        models: dict keyed by (rank, maxIter, regParam) -> fitted ALS model
        precision_at_k_scores: dict of precision-at-500 scores per key
        maps: dict of mean average precision scores per key
        NDCGs: dict of NDCG-at-500 scores per key
        times: dict of wall-clock seconds taken per key
    '''
    models = {}
    precision_at_k_scores = {}
    maps = {}
    NDCGs = {}
    times = {}
    sc = SparkContext.getOrCreate()
    spark = SparkSession.builder.appName("try").getOrCreate()

    # Build the ground-truth ranking ONCE, outside the grid loop.
    # BUG FIX: the original rebound the `val` parameter inside the innermost
    # loop, so every grid point after the first re-grouped an already-grouped
    # frame (and the SQL no longer matched its schema). It also queried a
    # temp view 'val' that this function never registered.
    # NOTE(review): 'SORT BY' only orders within partitions and collect_list
    # gives no order guarantee after groupBy -- kept from the original, but
    # confirm this ordering is acceptable for the truth lists.
    val.createOrReplaceTempView('val')
    truth_df = spark.sql('SELECT user_id, book_id FROM val SORT BY rating DESC')
    truth_df = truth_df.groupBy('user_id').agg(
        collect_list('book_id').alias('book_id_val'))
    truth_df.createOrReplaceTempView('truth')

    for r in ranks:
        for Iter in maxIters:
            for reg in regParams:
                st = time()
                # Initialize and train one model for this grid point.
                model = ALS(rank=r, maxIter=Iter, regParam=reg,
                            userCol='user_id', itemCol='book_id',
                            ratingCol='rating', coldStartStrategy='drop',
                            nonnegative=True)
                model = model.fit(train)
                models[(r, Iter, reg)] = model

                # Evaluate on validation: top-500 recommendations per user
                # joined against the precomputed ground truth.
                preds = model.recommendForAllUsers(500)
                preds.createOrReplaceTempView('preds')
                predAndTruth = spark.sql(
                    'SELECT preds.recommendations, truth.book_id_val '
                    'FROM truth join preds on preds.user_id = truth.user_id')

                # RankingMetrics wants an RDD of (predicted_list, truth_list).
                pairs = []
                for row in predAndTruth.collect():
                    truth = row[1]
                    # recommendations are Rows; keep only the book ids.
                    pred = [rec.book_id for rec in row[0]]
                    pairs.append((pred, truth))
                ranking_obj = RankingMetrics(sc.parallelize(pairs))

                precision_at_k_scores[(r, Iter, reg)] = ranking_obj.precisionAt(500)
                maps[(r, Iter, reg)] = ranking_obj.meanAveragePrecision
                NDCGs[(r, Iter, reg)] = ranking_obj.ndcgAt(500)
                times[(r, Iter, reg)] = round(time() - st, 5)
                print('Model with maxIter = {}, reg = {}, rank = {} complete'.
                      format(Iter, reg, r))
    return models, precision_at_k_scores, maps, NDCGs, times
# Load the preprocessed product-ratings CSV (path set earlier in the file)
# with schema inference and a header row, and show the inferred schema.
ProductsDF = spark.read.csv(pathToFile, inferSchema = True, header = True)
ProductsDF.printSchema()

# Read the product metadata file the same way.
pathToFile = 'Products_meta_preprocessed.csv'
metadataDF = spark.read.csv(pathToFile, inferSchema = True, header = True)
metadataDF.printSchema()

# Run cross-validation on the product ratings data.
# NOTE(review): `cvs` (the CrossValidator) is defined elsewhere in the file.
cvs_model = cvs.fit(ProductsDF)
# Keep the best model found by cross-validation.
recsys = cvs_model.bestModel

# Get the top-10 recommendations for every user and inspect them.
userRecs = recsys.recommendForAllUsers(10)
userRecs.show()
# Print the schema of the recommendations DataFrame.
userRecs.printSchema()

# Convert the recommendations to a list of rows for iteration.
# NOTE(review): `list` shadows the builtin of the same name from here on.
df = userRecs.toPandas()
list = df.values.tolist()
print(list[3])

# Convert the list back into a DataFrame with columns a/b/c.
import numpy as np
new =[]
dfi = pd.DataFrame(columns = ['a','b','c'])
# NOTE(review): this chunk is truncated -- the loop body is not visible here.
for i in list:
class RecommendationEngineALS:
    """Listing recommender built on Spark ALS with ratings loaded from MongoDB.

    On construction it starts a local Spark session, pulls the reviews
    collection from MongoDB, and prepares an (unfitted) ALS estimator.
    Call `get_top3_recommendations` to tune, fit, and fetch recommendations.
    """

    # SECURITY(review): credentials are hard-coded in this connection URI
    # (previously duplicated in two places). They should be supplied via
    # environment variables or a secrets store, not committed to source.
    _MONGO_URI = ('mongodb+srv://Monika:[email protected]'
                  '.mongodb.net/test?retryWrites=true&w=majority&authSource=admin')

    def __init__(self):
        """Start Spark, load the ratings DataFrame, and build the ALS estimator."""
        self.spark = SparkSession \
            .builder \
            .master("local") \
            .appName("Recommend") \
            .config('spark.mongodb.input.uri', self._MONGO_URI) \
            .config('spark.jars.packages',
                    'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1') \
            .getOrCreate()
        # Only the three columns ALS needs.
        self.ratingDF = self.__load_to_df().select('reviewer_id', 'listing_id',
                                                   'rating')
        # Unfitted estimator; tune_model replaces this with the fitted model.
        self.model = ALS(seed=1, nonnegative=True, userCol="reviewer_id",
                         itemCol="listing_id", ratingCol="rating",
                         coldStartStrategy="drop")

    def __load_to_df(self):
        """Load the `recommendation_system.reviews` collection as a DataFrame."""
        ratings_df = self.spark.read \
            .format("com.mongodb.spark.sql.DefaultSource") \
            .option("spark.mongodb.input.uri", self._MONGO_URI) \
            .option("database", "recommendation_system") \
            .option("collection", "reviews") \
            .load()
        return ratings_df

    def tune_ALS(self, model, train_data, validation_data, maxIter, regParams,
                 ranks):
        """
        Grid-search over (rank, regParam) and return the model with the
        lowest RMSE on the validation data.

        Parameters
        ----------
        model: spark ML model, ALS
        train_data: spark DF with columns ['reviewer_id', 'listing_id', 'rating']
        validation_data: spark DF with columns ['reviewer_id', 'listing_id', 'rating']
        maxIter: int, max number of learning iterations
        regParams: list of float, one dimension of hyper-param tuning grid
        ranks: list of float, one dimension of hyper-param tuning grid

        Return
        ------
        The best fitted ALS model with lowest RMSE score on validation data
        """
        min_error = float('inf')
        best_rank = -1
        best_regularization = 0
        best_model = None
        for rank in ranks:
            for reg in regParams:
                # Configure and fit this grid point.
                als = model.setMaxIter(maxIter).setRank(rank).setRegParam(reg)
                model_f = als.fit(train_data)
                # Score on held-out validation data.
                predictions = model_f.transform(validation_data)
                evaluator = RegressionEvaluator(metricName="rmse",
                                                labelCol="rating",
                                                predictionCol="prediction")
                rmse = evaluator.evaluate(predictions)
                print('{} latent factors and regularization = {}: '
                      'validation RMSE is {}'.format(rank, reg, rmse))
                if rmse < min_error:
                    min_error = rmse
                    best_rank = rank
                    best_regularization = reg
                    best_model = model_f
        print('\nThe best model has {} latent factors and '
              'regularization = {}'.format(best_rank, best_regularization))
        return best_model

    def tune_model(self, maxIter, regParams, ranks, split_ratio=None):
        """
        Hyperparameter tuning for the ALS model; reports test RMSE/MAE.

        Parameters
        ----------
        maxIter: int, max number of learning iterations
        regParams: list of float, regularization parameter
        ranks: list of float, number of latent factors
        split_ratio: sequence of 3 floats, (train, validation, test) weights;
            defaults to [0.8, 0.1, 0.1]
        """
        # BUG FIX: the default was a mutable list literal in the signature;
        # use the None-sentinel idiom instead.
        if split_ratio is None:
            split_ratio = [0.8, 0.1, 0.1]
        # Split into train / validation / test with a fixed seed.
        (trainingData, validationData, testData) = \
            self.ratingDF.randomSplit(split_ratio, seed=1)
        self.model = self.tune_ALS(self.model, trainingData, validationData,
                                   maxIter, regParams, ranks)
        # Final evaluation of the selected model on the untouched test split.
        prediction = self.model.transform(testData)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        evaluator2 = RegressionEvaluator(metricName="mae", labelCol="rating",
                                         predictionCol="prediction")
        rmse = evaluator.evaluate(prediction)
        mae = evaluator2.evaluate(prediction)
        print('RMSE is %s' % rmse)
        print('MAE is %s' % mae)

    def get_top3_recommendations(self, maxIter=12, regParam=[0.1, 0.2],
                                 ranks=[8, 12, 16]):
        """Tune the model, then return the top-3 recommendations per user
        as a pandas DataFrame.

        NOTE(review): the list defaults mirror the original public signature
        and are kept for compatibility; they are never mutated here.
        """
        self.tune_model(maxIter, regParam, ranks)
        userRecommendations = self.model.recommendForAllUsers(3).toPandas()
        return userRecommendations